Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/Analysis/CostModel/RISCV/stepvector.ll | 143
-rw-r--r-- llvm/test/Analysis/CostModel/X86/cast.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/extend.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll | 2413
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll | 2413
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll | 2413
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir | 1
-rw-r--r-- llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir | 12
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 112
-rw-r--r-- llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll | 20
-rw-r--r-- llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll | 11
-rw-r--r-- llvm/test/CodeGen/AArch64/cheap-as-a-move.ll | 30
-rw-r--r-- llvm/test/CodeGen/AArch64/extract-bits.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/hadd-combine.ll | 54
-rw-r--r-- llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll | 5
-rw-r--r-- llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll | 50
-rw-r--r-- llvm/test/CodeGen/AArch64/sadd_sat.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 299
-rw-r--r-- llvm/test/CodeGen/AArch64/sink-and-fold.ll | 4
-rw-r--r-- llvm/test/CodeGen/AArch64/sms-regpress.mir | 160
-rw-r--r-- llvm/test/CodeGen/AArch64/ssub_sat.ll | 2
-rw-r--r-- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 299
-rw-r--r-- llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll | 75
-rw-r--r-- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 295
-rw-r--r-- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 291
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 86
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16.ll | 73
-rw-r--r-- llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll | 11
-rw-r--r-- llvm/test/CodeGen/AMDGPU/function-args.ll | 1
-rw-r--r-- llvm/test/CodeGen/AMDGPU/function-returns.ll | 87
-rw-r--r-- llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll | 582
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll | 28
-rw-r--r-- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll | 4
-rw-r--r-- llvm/test/CodeGen/Generic/allow-check.ll | 1
-rw-r--r-- llvm/test/CodeGen/PowerPC/legalize-vaarg.ll | 17
-rw-r--r-- llvm/test/CodeGen/PowerPC/sms-regpress.mir | 186
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir | 902
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir | 534
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir | 900
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir | 900
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir | 1589
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir | 810
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir | 1589
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir | 694
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir | 817
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir | 116
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir | 88
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir | 1589
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir | 820
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir | 675
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir | 820
-rw-r--r-- llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir | 820
-rw-r--r-- llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll | 21
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll | 183
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll | 60
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll | 209
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 53
-rw-r--r-- llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 256
-rw-r--r-- llvm/test/CodeGen/SPARC/inlineasm-bad.ll | 9
-rw-r--r-- llvm/test/CodeGen/SPARC/inlineasm.ll | 9
-rw-r--r-- llvm/test/CodeGen/SPIRV/OpVariable_order.ll | 14
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll | 26
-rw-r--r-- llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll | 60
-rw-r--r-- llvm/test/CodeGen/WebAssembly/multi-return.ll | 72
-rw-r--r-- llvm/test/CodeGen/WebAssembly/simd-arith.ll | 13138
-rw-r--r-- llvm/test/CodeGen/WebAssembly/simd.ll | 408
-rw-r--r-- llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll | 11
-rw-r--r-- llvm/test/CodeGen/X86/AppendingLinkage.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/combine-pavg.ll | 9
-rw-r--r-- llvm/test/CodeGen/X86/evex-to-vex-compress.mir | 32
-rw-r--r-- llvm/test/CodeGen/X86/freeze-vector.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll | 220
-rw-r--r-- llvm/test/CodeGen/X86/load-local-v3i129.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/pr23664.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/vector-trunc-nowrap.ll | 2213
-rw-r--r-- llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll | 18
-rw-r--r-- llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll | 6
-rw-r--r-- llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll | 36
-rw-r--r-- llvm/test/MC/AMDGPU/gfx1150_asm_features.s | 21
-rw-r--r-- llvm/test/MC/AMDGPU/gfx11_asm_err.s | 8
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_features.s | 39
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 440
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 442
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s | 210
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s | 241
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s | 864
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s | 864
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 324
-rw-r--r-- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 324
-rw-r--r-- llvm/test/MC/AMDGPU/vop_dpp.s | 4
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt | 9
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt | 4
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt | 216
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt | 213
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt | 106
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt | 109
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt | 223
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt | 232
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 168
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 174
-rw-r--r-- llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt | 8
-rw-r--r-- llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt | 8
-rw-r--r-- llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt | 8
-rw-r--r-- llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt | 8
-rw-r--r-- llvm/test/MC/Mips/mips32r6/valid.s | 8
-rw-r--r-- llvm/test/MC/Mips/mips64r6/valid.s | 8
-rw-r--r-- llvm/test/MachineVerifier/test_g_fcmp.mir | 13
-rw-r--r-- llvm/test/MachineVerifier/test_g_icmp.mir | 13
-rw-r--r-- llvm/test/TableGen/x86-fold-tables.inc | 28
-rw-r--r-- llvm/test/Transforms/CorrelatedValuePropagation/range.ll | 86
-rw-r--r-- llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll | 77
-rw-r--r-- llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll | 77
-rw-r--r-- llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll | 163
-rw-r--r-- llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll | 163
-rw-r--r-- llvm/test/Transforms/InstCombine/implies.ll | 424
-rw-r--r-- llvm/test/Transforms/InstCombine/known-bits.ll | 106
-rw-r--r-- llvm/test/Transforms/InstCombine/select.ll | 794
-rw-r--r-- llvm/test/Transforms/InstCombine/zext-or-icmp.ll | 10
-rw-r--r-- llvm/test/Transforms/InstSimplify/implies.ll | 42
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll | 6
-rw-r--r-- llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll | 51
-rw-r--r-- llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll | 117
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll | 68
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll | 116
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll | 175
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll | 124
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll | 132
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll | 36
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll | 119
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll | 142
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll | 134
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll | 191
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll | 89
-rw-r--r-- llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll | 222
-rw-r--r-- llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll | 101
-rw-r--r-- llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll | 37
-rw-r--r-- llvm/test/Transforms/PGOProfile/vtable_profile.ll | 5
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll | 6
-rw-r--r-- llvm/test/Transforms/RemoveTraps/remove-traps.ll | 210
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll | 23
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll | 34
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll | 60
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll | 2
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll | 51
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll | 10
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll | 51
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll | 6
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll | 4
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll | 2
-rw-r--r-- llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof | 23
-rw-r--r-- llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof | 8
-rw-r--r-- llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof | 8
-rw-r--r-- llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll | 229
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll | 9
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll | 12
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll | 6
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll | 12
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll | 15
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll | 22
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll | 11
-rw-r--r-- llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll | 16
-rw-r--r-- llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll | 228
-rw-r--r-- llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll | 19
-rw-r--r-- llvm/test/Verifier/pr69428.ll | 48
-rw-r--r-- llvm/test/tools/dsymutil/ARM/firmware.test | 11
-rw-r--r-- llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o | bin 0 -> 528 bytes
-rwxr-xr-x llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out | bin 0 -> 16560 bytes
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s | 34
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s | 22
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s | 34
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s | 22
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s | 6
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s | 18
-rw-r--r-- llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s | 10
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s | 72
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s | 24
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s | 24
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s | 24
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s | 24
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s | 48
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s | 48
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s | 36
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s | 48
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s | 72
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s | 72
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s | 48
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s | 72
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s | 12
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s | 216
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s | 240
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s | 48
-rw-r--r-- llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s | 216
-rw-r--r-- llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s | 38
-rw-r--r-- llvm/test/tools/llvm-objcopy/ELF/compress-sections.s | 128
-rw-r--r-- llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test | 29
-rw-r--r-- llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s | 7
-rw-r--r-- llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s | 298
-rw-r--r-- llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s | 2
253 files changed, 42793 insertions, 11001 deletions
diff --git a/llvm/test/Analysis/CostModel/RISCV/stepvector.ll b/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
index 7d29d2c..e599955 100644
--- a/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/stepvector.ll
@@ -4,98 +4,60 @@
define void @stepvector() {
; CHECK-LABEL: 'stepvector'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %30 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %39 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %41 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %44 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %7 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %8 = call <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %15 = call <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %19 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %20 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = call <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %25 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %26 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
- %zero = call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
- %1 = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
- %2 = call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
- %3 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %4 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %5 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %6 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %7 = call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
- %8 = call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
- %9 = call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
- %10 = call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
- %11 = call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
- %12 = call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
- %13 = call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
- %14 = call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
- %15 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %16 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %17 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %18 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %19 = call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
- %20 = call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
- %21 = call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
- %22 = call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
- %23 = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
- %24 = call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
- %25 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %26 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %27 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %28 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %29 = call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
- %30 = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
- %31 = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
- %32 = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
- %33 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %34 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %35 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %36 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %37 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %38 = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
- %39 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %40 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %41 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %42 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %43 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
- %44 = call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
+ call <vscale x 1 x i8> @llvm.experimental.stepvector.nxv1i8()
+ call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+ call <vscale x 4 x i8> @llvm.experimental.stepvector.nxv4i8()
+ call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+ call <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
+ call <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
+ call <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+ call <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
+ call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
+ call <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
+ call <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
+ call <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
+ call <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
+ call <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+ call <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
+ call <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
+ call <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
+ call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+ call <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
+ call <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+ call <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
+ call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+ call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+ call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+ call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
+ call <vscale x 16 x i64> @llvm.experimental.stepvector.nxv16i64()
ret void
}
@@ -107,17 +69,20 @@ declare <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
declare <vscale x 16 x i8> @llvm.experimental.stepvector.nxv16i8()
declare <vscale x 32 x i8> @llvm.experimental.stepvector.nxv32i8()
declare <vscale x 64 x i8> @llvm.experimental.stepvector.nxv64i8()
+declare <vscale x 128 x i8> @llvm.experimental.stepvector.nxv128i8()
declare <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
declare <vscale x 2 x i16> @llvm.experimental.stepvector.nxv2i16()
declare <vscale x 4 x i16> @llvm.experimental.stepvector.nxv4i16()
declare <vscale x 8 x i16> @llvm.experimental.stepvector.nxv8i16()
declare <vscale x 16 x i16> @llvm.experimental.stepvector.nxv16i16()
declare <vscale x 32 x i16> @llvm.experimental.stepvector.nxv32i16()
+declare <vscale x 64 x i16> @llvm.experimental.stepvector.nxv64i16()
declare <vscale x 1 x i32> @llvm.experimental.stepvector.nxv1i32()
declare <vscale x 2 x i32> @llvm.experimental.stepvector.nxv2i32()
declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
declare <vscale x 8 x i32> @llvm.experimental.stepvector.nxv8i32()
declare <vscale x 16 x i32> @llvm.experimental.stepvector.nxv16i32()
+declare <vscale x 32 x i32> @llvm.experimental.stepvector.nxv32i32()
declare <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
declare <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
declare <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 64ed9bed..47487d6 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -35,7 +35,7 @@ define i32 @add(i32 %arg) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %B = sext <4 x i1> undef to <4 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %C = trunc <4 x i32> undef to <4 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <8 x i1> undef to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %E = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %F = trunc <8 x i32> undef to <8 x i1>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %G = zext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %H = trunc i32 undef to i1
@@ -143,7 +143,7 @@ define i32 @zext_sext(<8 x i1> %in) {
;
; AVX1-LABEL: 'zext_sext'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = zext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = sext <16 x i8> undef to <16 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i16> undef to <8 x i32>
@@ -343,7 +343,7 @@ define i32 @masks8(<8 x i1> %in) {
;
; AVX1-LABEL: 'masks8'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <8 x i1> %in to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %S = sext <8 x i1> %in to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <8 x i1> %in to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks8'
@@ -374,7 +374,7 @@ define i32 @masks4(<4 x i1> %in) {
;
; AVX1-LABEL: 'masks4'
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %Z = zext <4 x i1> %in to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %S = sext <4 x i1> %in to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %S = sext <4 x i1> %in to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'masks4'
diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
index 01efced..4a2585a 100644
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -1962,7 +1962,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-LABEL: 'sext_vXi1'
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -1971,7 +1971,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
@@ -2242,7 +2242,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-LABEL: 'sext_vXi1'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sext i1 undef to i64
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i1> undef to <2 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i1> undef to <4 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i64 = sext <8 x i1> undef to <8 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i64 = sext <16 x i1> undef to <16 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i64 = sext <32 x i1> undef to <32 x i64>
@@ -2251,7 +2251,7 @@ define i32 @sext_vXi1() "min-legal-vector-width"="256" {
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sext i1 undef to i32
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i1> undef to <2 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sext <4 x i1> undef to <4 x i32>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i1> undef to <8 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sext <16 x i1> undef to <16 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32i32 = sext <32 x i1> undef to <32 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64i32 = sext <64 x i1> undef to <64 x i32>
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
new file mode 100644
index 0000000..55fdaaf
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-codesize.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
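+; masked_store exercises the same matrix of element types and vector lengths
+; as masked_load above, and the costs are expected to mirror the load case:
+; pre-AVX targets scalarize the masked store (cost grows roughly linearly
+; with element count), AVX can use vmaskmov for 32/64-bit elements, and
+; i8/i16 elements remain expensive until AVX512BW (the SKX prefix).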
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
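+; Check the cost of llvm.masked.gather over the same element types. Targets
+; without a native gather instruction pay a per-element scalarization cost.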
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
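+
+; Gather exists in hardware only for 32/64-bit elements (vgatherdps,
+; vpgatherdd and friends); the cost-1 rows above are the cases the backend
+; maps to a real gather instruction, while i16/i8 gathers, and targets where
+; gather is considered slow, fall back to scalarized loads whose cost scales
+; roughly linearly with the element count.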
+
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
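+
+; Scatter only exists as a hardware instruction on AVX-512, and only for
+; 32/64-bit elements; the table above reflects that: the SSE and AVX targets
+; scalarize every case, while KNL and SKX report cost 1 for the vector widths
+; their scatter forms cover and scalarize the rest.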
+
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
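+; Verify the estimated costs of llvm.masked.compressstore across f64/f32/i64/i32/i16/i8 element types and vector widths on each target.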
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
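+; Masked load of <2 x double> guarded by a mask computed from an icmp against zero.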
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
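+; Masked load of <4 x i32> with an icmp-derived mask.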
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
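+; Masked store of <4 x i32> with an icmp-derived mask.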
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
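+; Masked load of <8 x float>; on AVX1 the 256-bit <8 x i32> compare itself is also costed higher (5 vs 1).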
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
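+; Masked store of <2 x float>.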
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
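+; Masked store of <2 x i32>.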
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
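+; Masked load of <2 x float>.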
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
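+; Masked load of <2 x i32>.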
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
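+; Gather of <2 x double> through a vector of pointers; only SKL reports the cheap (cost 1) gather at this width.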
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
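+; Gather of <4 x i32>; KNL and SKX are checked separately since their costs diverge (20 vs 1).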
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
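+; The same <4 x i32> gather with an all-true constant mask, which reduces the cost on targets without a cheap gather.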
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
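+; Gather of <16 x float> addressed from a common base pointer, with an all-true constant mask.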
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
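+; Gather of <16 x float> addressed from a common base pointer, with a variable mask.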
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
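+; As above, except the addresses arrive as a <16 x ptr> argument rather than
+; being computed from a scalar base; the modeled costs are unchanged.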
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
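+; All-true mask with the base pointer splatted across a <16 x ptr> vector;
+; the splat's insertelement/shufflevector are costed separately from the
+; gather itself.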
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
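+; A <16 x i32> scatter whose mask is produced by bitcasting an i16. Scatter
+; requires AVX512, so even SKL scalarizes here (cost 82); only the AVX512
+; configurations model a single scatter instruction.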
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
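+; Narrower scatters: at <8 x i32> the pre-AVX512 targets still scalarize,
+; and the AVX1/AVX2/SKL costs agree, so they share the AVX check prefix.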
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
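+; At <4 x i32> the AVX512 prefixes diverge: KNL lacks AVX512VL, so 128-bit
+; scatters scalarize there (cost 20), while SKX keeps the single-instruction
+; cost.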
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
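+; <4 x float> gather: KNL likewise scalarizes the 128-bit case (cost 19),
+; while SKL and SKX model a single gather.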
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
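+; The same 4-element gather with an all-true constant mask: the scalarized
+; estimate drops (11 vs. 19) since no per-lane mask tests are needed.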
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
index 897344d..ad56c28 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
index 5f22b2e..c7e7c46 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE2
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=SSE42
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mattr=+avx2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,AVX2
-;
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skylake -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX,SKL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=knl -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,KNL
-; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -mcpu=skx -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s --check-prefixes=AVX512,SKX
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
define i32 @masked_load() {
; SSE2-LABEL: 'masked_load'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
new file mode 100644
index 0000000..edb05ad
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-latency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
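+; Costs below use -cost-kind=latency. Without masked-load hardware the
+; intrinsic scalarizes into per-element branch-and-load sequences, so the
+; estimates grow roughly linearly with the element count.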
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
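+
+; A reading of the masked_load latency costs above (informal, inferred from
+; the checks rather than stated by the cost model itself): SSE2/SSE4.2 have no
+; masked-load instruction, so the model prices a scalarized expansion (extract
+; a mask bit, branch, scalar load, insert element) that grows roughly linearly
+; with the element count; AVX provides vmaskmovps/vmaskmovpd (and AVX2
+; vpmaskmovd/vpmaskmovq) for 32/64-bit elements, collapsing those cases to
+; small constants while i8/i16 vectors still scalarize; AVX512F (KNL) covers
+; 32/64-bit elements with mask registers, and AVX512BW (SKX) extends that to
+; i8/i16, which is why %V32I16 and %V64I8 drop from 164 and 326 on KNL to 1
+; on SKX.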
+
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
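+; x86 has no scatter instruction before AVX-512, so the SSE/AVX costs below
+; reflect full scalarization: extract each pointer and element, test each mask
+; bit, and store. On KNL/SKX a cost of 1 marks the types that map directly to
+; vscatter (32/64-bit elements); i16/i8 scatters remain scalarized on every
+; subtarget.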
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
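+; llvm.masked.expandload likewise has no native lowering before AVX-512
+; (vexpandps/vpexpandd and friends), so the SSE/AVX costs below model
+; per-element conditional loads plus insertions and scale roughly linearly
+; with the element count.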
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
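+; Compress-store counterpart of the expandload coverage above: cost-model
+; estimates for llvm.masked.compressstore over f64/f32/i64/i32/i16/i8 vectors
+; at each subtarget level.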
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
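+; test1: <2 x double> masked load with an icmp-derived mask.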
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
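+; test2: <4 x i32> masked load; scalarized (cost 20+) on SSE, cheap from AVX onwards.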
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
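+; test3: <4 x i32> masked store.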
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
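+; test4: <8 x float> (256-bit) masked load.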
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
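+; test5: <2 x float> masked store; sub-XMM widths stay expensive even on AVX.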
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
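+; test6: <2 x i32> masked store.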
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
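+; test7: <2 x float> masked load.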
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
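+; test8: <2 x i32> masked load.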
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
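+; test_gather_2f64: <2 x double> masked gather; only SKL reports a native-gather
+; cost of 1, every other subtarget (AVX512 included) pays the scalarized price.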
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
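+; Gather of <4 x i32> with a variable mask; cost 1 on SKL/SKX, 20-23 elsewhere.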
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
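+; Same gather with an all-true constant mask; the scalarized estimates drop to 15 (SSE2) and 12 elsewhere, still 1 on SKL/SKX.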
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
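+; Gather of <16 x float> from a common base with sign-extended indices and an all-true mask; cost 1 on SKL and AVX512.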
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float> %res
+}
+
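+; As above, but with a variable mask; the scalarized estimate rises to 76 (SSE) / 78 (AVX1/AVX2).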
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float> %res
+}
+
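+; Variable-mask gather where the pointers arrive as a vector argument instead of being computed from a scalar base; costs match the var-mask case above.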
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  ret <16 x float> %res
+}
+
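+; Constant-mask gather where the base pointer is splatted into a pointer vector before indexing; the gather cost matches the const-mask case above, plus the splat overhead.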
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  ret <16 x float> %res
+}
+
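+; Scatter of <16 x i32> with an i16 bitmask; cost 1 only on AVX512, scalarized estimates of 80-92 elsewhere (including SKL, since scatter instructions only exist with AVX512).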
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
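+; Scatter of <8 x i32> through a pointer vector; cheap only on AVX512 (cost 1), 40-46 elsewhere.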
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
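+; Scatter of <4 x i32>; cheap only on SKX (cost 1), 20-23 on the other targets.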
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
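+; Gather of <4 x float> with a variable mask; cost 1 on SKL/SKX, 19 elsewhere.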
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  ret <4 x float> %res
+}
+
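+; Same gather with an all-true constant mask; the scalarized estimate drops to 11 (from 19), still 1 on SKL/SKX.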
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  ret <4 x float> %res
+}
+
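+; Declarations for the masked load/store, gather/scatter, expand-load, and compress-store intrinsics exercised by the tests in this file.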
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
new file mode 100644
index 0000000..3ebd9cc
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-sizelatency.ll
@@ -0,0 +1,2413 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+;
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skylake | FileCheck %s --check-prefixes=AVX,SKL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=knl | FileCheck %s --check-prefixes=AVX512,KNL
+; RUN: opt < %s -S -mtriple=x86_64-apple-darwin -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=skx | FileCheck %s --check-prefixes=AVX512,SKX
+
+define i32 @masked_load() {
+; SSE2-LABEL: 'masked_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_load'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_load'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_load'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x double> undef)
+ %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x double> undef)
+ %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x float> undef)
+ %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x float> undef)
+ %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x float> undef)
+ %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x float> undef)
+ %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x float> undef)
+ %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x float> undef)
+ %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x float> undef)
+ %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x float> undef)
+ %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x float> undef)
+ %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i64> undef)
+ %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i64> undef)
+ %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> undef, <15 x i32> undef)
+ %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> undef, <14 x i32> undef)
+ %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> undef, <13 x i32> undef)
+ %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> undef, <12 x i32> undef)
+ %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> undef, <11 x i32> undef)
+ %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> undef, <10 x i32> undef)
+ %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> undef, <9 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> undef, <7 x i32> undef)
+ %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> undef, <6 x i32> undef)
+ %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> undef, <5 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> undef, <3 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+ %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> undef, <1 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
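+; masked_store mirrors masked_load above: the same fixed-width vectors are
+; stored through llvm.masked.store and costed per target prefix. The visible
+; pattern: pre-AVX targets pay roughly per-lane scalarization costs, AVX keeps
+; the 32/64-bit element cases cheap via masked-move instructions, and the
+; AVX-512 targets fold most cases into 1-2 (the i16/i8 cases need AVX512BW,
+; which KNL lacks and SKX has).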
+define i32 @masked_store() {
+; SSE2-LABEL: 'masked_store'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 37 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 91 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 85 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 79 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 62 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 107 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 93 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 73 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 59 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 190 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 440 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 220 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_store'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 71 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 57 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 42 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 75 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 55 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_store'
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 163 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 324 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_store'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 164 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 81 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 326 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 162 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_store'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> undef)
+ call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> undef)
+ call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> undef)
+ call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> undef)
+ call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> undef)
+ call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> undef)
+ call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> undef)
+ call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> undef)
+ call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> undef)
+ call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> undef)
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> undef)
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
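+; masked_gather follows the same scheme with vector-of-pointer operands. SSE2
+; has no gather instruction, so the costs below reflect full per-lane
+; scalarization (extract pointer and mask bit, conditional scalar load,
+; insert into the result vector).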
+define i32 @masked_gather() {
+; SSE2-LABEL: 'masked_gather'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_gather'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_gather'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_gather'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_gather'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 194 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_gather'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_gather'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
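+; Expected scatter costs (summary comment, not checked output): scatter only
+; exists under AVX-512 (vscatter*), so the SSE/AVX prefixes all show
+; scalarized, lane-proportional estimates; KNL/SKX report cost 1 where a
+; legal scatter exists for the 32/64-bit element type and vector width, and
+; i16/i8 scatters are scalarized everywhere.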
+define i32 @masked_scatter() {
+; SSE2-LABEL: 'masked_scatter'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 252 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 126 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_scatter'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_scatter'
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 98 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 194 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; KNL-LABEL: 'masked_scatter'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKX-LABEL: 'masked_scatter'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+ call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
+
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+ call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
+
+ call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+ call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
+
+ call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef)
+ call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
+ call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
+ call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
+
+ ret i32 0
+}
+
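+; llvm.masked.expandload costs per subtarget. In each configuration below the
+; estimate scales roughly linearly with the element count (halving the vector
+; width roughly halves the cost), with the narrow-element i16/i8 vectors the
+; most expensive overall.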
+define i32 @masked_expandload() {
+; SSE2-LABEL: 'masked_expandload'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_expandload'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX-LABEL: 'masked_expandload'
+; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_expandload'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ %V8F64 = call <8 x double> @llvm.masked.expandload.v8f64(ptr undef, <8 x i1> undef, <8 x double> undef)
+ %V4F64 = call <4 x double> @llvm.masked.expandload.v4f64(ptr undef, <4 x i1> undef, <4 x double> undef)
+ %V2F64 = call <2 x double> @llvm.masked.expandload.v2f64(ptr undef, <2 x i1> undef, <2 x double> undef)
+ %V1F64 = call <1 x double> @llvm.masked.expandload.v1f64(ptr undef, <1 x i1> undef, <1 x double> undef)
+
+ %V16F32 = call <16 x float> @llvm.masked.expandload.v16f32(ptr undef, <16 x i1> undef, <16 x float> undef)
+ %V8F32 = call <8 x float> @llvm.masked.expandload.v8f32(ptr undef, <8 x i1> undef, <8 x float> undef)
+ %V4F32 = call <4 x float> @llvm.masked.expandload.v4f32(ptr undef, <4 x i1> undef, <4 x float> undef)
+ %V2F32 = call <2 x float> @llvm.masked.expandload.v2f32(ptr undef, <2 x i1> undef, <2 x float> undef)
+
+ %V8I64 = call <8 x i64> @llvm.masked.expandload.v8i64(ptr undef, <8 x i1> undef, <8 x i64> undef)
+ %V4I64 = call <4 x i64> @llvm.masked.expandload.v4i64(ptr undef, <4 x i1> undef, <4 x i64> undef)
+ %V2I64 = call <2 x i64> @llvm.masked.expandload.v2i64(ptr undef, <2 x i1> undef, <2 x i64> undef)
+ %V1I64 = call <1 x i64> @llvm.masked.expandload.v1i64(ptr undef, <1 x i1> undef, <1 x i64> undef)
+
+ %V16I32 = call <16 x i32> @llvm.masked.expandload.v16i32(ptr undef, <16 x i1> undef, <16 x i32> undef)
+ %V8I32 = call <8 x i32> @llvm.masked.expandload.v8i32(ptr undef, <8 x i1> undef, <8 x i32> undef)
+ %V4I32 = call <4 x i32> @llvm.masked.expandload.v4i32(ptr undef, <4 x i1> undef, <4 x i32> undef)
+ %V2I32 = call <2 x i32> @llvm.masked.expandload.v2i32(ptr undef, <2 x i1> undef, <2 x i32> undef)
+
+ %V32I16 = call <32 x i16> @llvm.masked.expandload.v32i16(ptr undef, <32 x i1> undef, <32 x i16> undef)
+ %V16I16 = call <16 x i16> @llvm.masked.expandload.v16i16(ptr undef, <16 x i1> undef, <16 x i16> undef)
+ %V8I16 = call <8 x i16> @llvm.masked.expandload.v8i16(ptr undef, <8 x i1> undef, <8 x i16> undef)
+ %V4I16 = call <4 x i16> @llvm.masked.expandload.v4i16(ptr undef, <4 x i1> undef, <4 x i16> undef)
+
+ %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(ptr undef, <64 x i1> undef, <64 x i8> undef)
+ %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(ptr undef, <32 x i1> undef, <32 x i8> undef)
+ %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(ptr undef, <16 x i1> undef, <16 x i8> undef)
+ %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(ptr undef, <8 x i1> undef, <8 x i8> undef)
+
+ ret i32 0
+}
+
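+; llvm.masked.compressstore costs per subtarget. As with expandload above, the
+; estimate tracks the element count, and the <64 x i8> case is the most
+; expensive on every configuration checked.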
+define i32 @masked_compressstore() {
+; SSE2-LABEL: 'masked_compressstore'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SSE42-LABEL: 'masked_compressstore'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX1-LABEL: 'masked_compressstore'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 134 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX2-LABEL: 'masked_compressstore'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; SKL-LABEL: 'masked_compressstore'
+; SKL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 67 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 132 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 33 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+; AVX512-LABEL: 'masked_compressstore'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 47 for instruction: call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 27 for instruction: call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 51 for instruction: call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 25 for instruction: call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 99 for instruction: call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 195 for instruction: call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 97 for instruction: call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 0
+;
+ call void @llvm.masked.compressstore.v8f64(<8 x double> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f64(<4 x double> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f64(<2 x double> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1f64(<1 x double> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16f32(<16 x float> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8f32(<8 x float> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4f32(<4 x float> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2f32(<2 x float> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v8i64(<8 x i64> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i64(<4 x i64> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i64(<2 x i64> undef, ptr undef, <2 x i1> undef)
+ call void @llvm.masked.compressstore.v1i64(<1 x i64> undef, ptr undef, <1 x i1> undef)
+
+ call void @llvm.masked.compressstore.v16i32(<16 x i32> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i32(<8 x i32> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i32(<4 x i32> undef, ptr undef, <4 x i1> undef)
+ call void @llvm.masked.compressstore.v2i32(<2 x i32> undef, ptr undef, <2 x i1> undef)
+
+ call void @llvm.masked.compressstore.v32i16(<32 x i16> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i16(<16 x i16> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i16(<8 x i16> undef, ptr undef, <8 x i1> undef)
+ call void @llvm.masked.compressstore.v4i16(<4 x i16> undef, ptr undef, <4 x i1> undef)
+
+ call void @llvm.masked.compressstore.v64i8(<64 x i8> undef, ptr undef, <64 x i1> undef)
+ call void @llvm.masked.compressstore.v32i8(<32 x i8> undef, ptr undef, <32 x i1> undef)
+ call void @llvm.masked.compressstore.v16i8(<16 x i8> undef, ptr undef, <16 x i1> undef)
+ call void @llvm.masked.compressstore.v8i8(<8 x i8> undef, ptr undef, <8 x i1> undef)
+
+ ret i32 0
+}
+
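+; The remaining tests pair an icmp-generated mask with a masked load or store,
+; so the per-subtarget estimates cover both the mask creation and the masked
+; memory operation itself.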
+define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) {
+; SSE2-LABEL: 'test1'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test1'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX-LABEL: 'test1'
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test1'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %mask = icmp eq <2 x i64> %trigger, zeroinitializer
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
+ ret <2 x double> %res
+}
+
+define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) {
+; SSE2-LABEL: 'test2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX-LABEL: 'test2'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX512-LABEL: 'test2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
+define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) {
+; SSE2-LABEL: 'test3'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test3'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test3'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test3'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
+define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) {
+; SSE2-LABEL: 'test4'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SSE42-LABEL: 'test4'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX1-LABEL: 'test4'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX2-LABEL: 'test4'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; SKL-LABEL: 'test4'
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+; AVX512-LABEL: 'test4'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x float> %res
+;
+ %mask = icmp eq <8 x i32> %trigger, zeroinitializer
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
+define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) {
+; SSE2-LABEL: 'test5'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test5'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test5'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test5'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
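+; Masked store of <2 x i32>: same pattern as test5, with a slightly higher scalarization cost (10-12) for the integer element type.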
+define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) {
+; SSE2-LABEL: 'test6'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test6'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test6'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test6'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask)
+ ret void
+}
+
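+; Masked load of <2 x float>: cost 9-10 when scalarized on SSE, 3 on AVX, 2 on AVX512.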
+define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) {
+; SSE2-LABEL: 'test7'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; SSE42-LABEL: 'test7'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX-LABEL: 'test7'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+; AVX512-LABEL: 'test7'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x float> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
+ ret <2 x float> %res
+}
+
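+; Masked load of <2 x i32>: the integer counterpart of test7; scalarization is a little costlier (10-12).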
+define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) {
+; SSE2-LABEL: 'test8'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; SSE42-LABEL: 'test8'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX-LABEL: 'test8'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+; AVX512-LABEL: 'test8'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %res
+;
+ %mask = icmp eq <2 x i32> %trigger, zeroinitializer
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
+ ret <2 x i32> %res
+}
+
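+; Gather of <2 x double>: modelled as scalarized (cost 9) on every target here except SKL, which gets a single gather (cost 1).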
+define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
+; SSE2-LABEL: 'test_gather_2f64'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SSE42-LABEL: 'test_gather_2f64'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX1-LABEL: 'test_gather_2f64'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX2-LABEL: 'test_gather_2f64'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; SKL-LABEL: 'test_gather_2f64'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+; AVX512-LABEL: 'test_gather_2f64'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x double> %res
+;
+ %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+ ret <2 x double> %res
+}
+
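+; Gather of <4 x i32> with a variable mask: scalarized at cost 20-23, including on KNL; SKL/SKX model a single gather (cost 1).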
+define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
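+; The same <4 x i32> gather with an all-true mask: the scalarized cost drops to 12-15, presumably because no per-lane mask checks are needed.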
+define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) {
+; SSE2-LABEL: 'test_gather_4i32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SSE42-LABEL: 'test_gather_4i32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX1-LABEL: 'test_gather_4i32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; AVX2-LABEL: 'test_gather_4i32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKL-LABEL: 'test_gather_4i32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; KNL-LABEL: 'test_gather_4i32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+; SKX-LABEL: 'test_gather_4i32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %res
+;
+ %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+ ret <4 x i32> %res
+}
+
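+; 16-wide float gather with an all-true mask: scalarized at 44-46 pre-AVX512, a single gather (cost 1) on SKL and AVX512.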
+define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
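+; The same 16-wide gather with a variable mask: handling the mask per lane pushes the scalarized cost up to 76-78.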
+define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
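+; As test_gather_16f32_var_mask, but addressing off a vector of pointers instead of a scalar base; the modelled costs are identical.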
+define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> %ind, <16 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_ra_var_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_ra_var_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_ra_var_mask'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+ ret <16 x float> %res
+}
+
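+; Const-mask gather where the base pointer is splatted via insertelement/shufflevector; the gather cost matches test_gather_16f32_const_mask.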
+define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_16f32_const_mask2'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SSE42-LABEL: 'test_gather_16f32_const_mask2'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX1-LABEL: 'test_gather_16f32_const_mask2'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX2-LABEL: 'test_gather_16f32_const_mask2'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; SKL-LABEL: 'test_gather_16f32_const_mask2'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+; AVX512-LABEL: 'test_gather_16f32_const_mask2'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <16 x i32> %ind to <16 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x float> %res
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float> %res
+}
+
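+; 16-wide scatter with an i16 bitmask: scalarized at 80-92 on every target without AVX512 scatters (including SKL); cost 1 on AVX512.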
+define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32> %val) {
+; SSE2-LABEL: 'test_scatter_16i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_16i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX1-LABEL: 'test_scatter_16i32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX2-LABEL: 'test_scatter_16i32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKL-LABEL: 'test_scatter_16i32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; SKL-NEXT: Cost Model: Found an estimated cost of 82 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_16i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %imask = bitcast i16 %mask to <16 x i1>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
+
+ %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
+ %imask = bitcast i16 %mask to <16 x i1>
+ call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
+ ret void
+}
+
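+; 8-wide scatter: scalarized at cost 40-46 on SSE/AVX, a single scatter (cost 1) on AVX512.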
+define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_8i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_8i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_8i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 41 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512-LABEL: 'test_scatter_8i32'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask)
+ ret void
+}
+
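+; 4-wide scatter: scalarized at cost 20-23 on SSE/AVX and KNL; SKX models a single scatter (cost 1).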
+define void @test_scatter_4i32(<4 x i32> %a1, <4 x ptr> %ptr, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_scatter_4i32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SSE42-LABEL: 'test_scatter_4i32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX-LABEL: 'test_scatter_4i32'
+; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; KNL-LABEL: 'test_scatter_4i32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; SKX-LABEL: 'test_scatter_4i32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
+ ret void
+}
+
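+; 4-wide float gather with a variable mask: scalarized at cost 19 on SSE/AVX/KNL, a single gather (cost 1) on SKL/SKX.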
+define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1> %mask) {
+; SSE2-LABEL: 'test_gather_4f32'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+ ret <4 x float> %res
+}
+
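+; The same 4-wide gather with an all-true mask: the scalarized cost drops to 11; still cost 1 on SKL/SKX.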
+define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) {
+; SSE2-LABEL: 'test_gather_4f32_const_mask'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SSE42-LABEL: 'test_gather_4f32_const_mask'
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX1-LABEL: 'test_gather_4f32_const_mask'
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; AVX2-LABEL: 'test_gather_4f32_const_mask'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKL-LABEL: 'test_gather_4f32_const_mask'
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; KNL-LABEL: 'test_gather_4f32_const_mask'
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; KNL-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+; SKX-LABEL: 'test_gather_4f32_const_mask'
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext_ind = sext <4 x i32> %ind to <4 x i64>
+; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %res
+;
+ %sext_ind = sext <4 x i32> %ind to <4 x i64>
+ %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind
+
+ %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+ ret <4 x float> %res
+}
+
+declare <8 x double> @llvm.masked.load.v8f64.p0(ptr, i32, <8 x i1>, <8 x double>)
+declare <7 x double> @llvm.masked.load.v7f64.p0(ptr, i32, <7 x i1>, <7 x double>)
+declare <6 x double> @llvm.masked.load.v6f64.p0(ptr, i32, <6 x i1>, <6 x double>)
+declare <5 x double> @llvm.masked.load.v5f64.p0(ptr, i32, <5 x i1>, <5 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0(ptr, i32, <4 x i1>, <4 x double>)
+declare <3 x double> @llvm.masked.load.v3f64.p0(ptr, i32, <3 x i1>, <3 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.load.v1f64.p0(ptr, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.load.v16f32.p0(ptr, i32, <16 x i1>, <16 x float>)
+declare <15 x float> @llvm.masked.load.v15f32.p0(ptr, i32, <15 x i1>, <15 x float>)
+declare <14 x float> @llvm.masked.load.v14f32.p0(ptr, i32, <14 x i1>, <14 x float>)
+declare <13 x float> @llvm.masked.load.v13f32.p0(ptr, i32, <13 x i1>, <13 x float>)
+declare <12 x float> @llvm.masked.load.v12f32.p0(ptr, i32, <12 x i1>, <12 x float>)
+declare <11 x float> @llvm.masked.load.v11f32.p0(ptr, i32, <11 x i1>, <11 x float>)
+declare <10 x float> @llvm.masked.load.v10f32.p0(ptr, i32, <10 x i1>, <10 x float>)
+declare <9 x float> @llvm.masked.load.v9f32.p0(ptr, i32, <9 x i1>, <9 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>)
+declare <7 x float> @llvm.masked.load.v7f32.p0(ptr, i32, <7 x i1>, <7 x float>)
+declare <6 x float> @llvm.masked.load.v6f32.p0(ptr, i32, <6 x i1>, <6 x float>)
+declare <5 x float> @llvm.masked.load.v5f32.p0(ptr, i32, <5 x i1>, <5 x float>)
+declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32, <4 x i1>, <4 x float>)
+declare <3 x float> @llvm.masked.load.v3f32.p0(ptr, i32, <3 x i1>, <3 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0(ptr, i32, <2 x i1>, <2 x float>)
+declare <1 x float> @llvm.masked.load.v1f32.p0(ptr, i32, <1 x i1>, <1 x float>)
+
+declare <8 x i64> @llvm.masked.load.v8i64.p0(ptr, i32, <8 x i1>, <8 x i64>)
+declare <7 x i64> @llvm.masked.load.v7i64.p0(ptr, i32, <7 x i1>, <7 x i64>)
+declare <6 x i64> @llvm.masked.load.v6i64.p0(ptr, i32, <6 x i1>, <6 x i64>)
+declare <5 x i64> @llvm.masked.load.v5i64.p0(ptr, i32, <5 x i1>, <5 x i64>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0(ptr, i32, <4 x i1>, <4 x i64>)
+declare <3 x i64> @llvm.masked.load.v3i64.p0(ptr, i32, <3 x i1>, <3 x i64>)
+declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.load.v1i64.p0(ptr, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0(ptr, i32, <16 x i1>, <16 x i32>)
+declare <15 x i32> @llvm.masked.load.v15i32.p0(ptr, i32, <15 x i1>, <15 x i32>)
+declare <14 x i32> @llvm.masked.load.v14i32.p0(ptr, i32, <14 x i1>, <14 x i32>)
+declare <13 x i32> @llvm.masked.load.v13i32.p0(ptr, i32, <13 x i1>, <13 x i32>)
+declare <12 x i32> @llvm.masked.load.v12i32.p0(ptr, i32, <12 x i1>, <12 x i32>)
+declare <11 x i32> @llvm.masked.load.v11i32.p0(ptr, i32, <11 x i1>, <11 x i32>)
+declare <10 x i32> @llvm.masked.load.v10i32.p0(ptr, i32, <10 x i1>, <10 x i32>)
+declare <9 x i32> @llvm.masked.load.v9i32.p0(ptr, i32, <9 x i1>, <9 x i32>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0(ptr, i32, <8 x i1>, <8 x i32>)
+declare <7 x i32> @llvm.masked.load.v7i32.p0(ptr, i32, <7 x i1>, <7 x i32>)
+declare <6 x i32> @llvm.masked.load.v6i32.p0(ptr, i32, <6 x i1>, <6 x i32>)
+declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr, i32, <5 x i1>, <5 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
+declare <3 x i32> @llvm.masked.load.v3i32.p0(ptr, i32, <3 x i1>, <3 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>)
+declare <1 x i32> @llvm.masked.load.v1i32.p0(ptr, i32, <1 x i1>, <1 x i32>)
+
+declare <32 x i16> @llvm.masked.load.v32i16.p0(ptr, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.load.v16i16.p0(ptr, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.load.v64i8.p0(ptr, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.load.v32i8.p0(ptr, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f64.p0(<7 x double>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f64.p0(<6 x double>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f64.p0(<5 x double>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f64.p0(<4 x double>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f64.p0(<3 x double>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f64.p0(<2 x double>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f64.p0(<1 x double>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16f32.p0(<16 x float>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15f32.p0(<15 x float>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14f32.p0(<14 x float>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13f32.p0(<13 x float>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12f32.p0(<12 x float>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11f32.p0(<11 x float>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10f32.p0(<10 x float>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9f32.p0(<9 x float>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7f32.p0(<7 x float>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6f32.p0(<6 x float>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5f32.p0(<5 x float>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3f32.p0(<3 x float>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2f32.p0(<2 x float>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1f32.p0(<1 x float>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v8i64.p0(<8 x i64>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i64.p0(<7 x i64>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i64.p0(<6 x i64>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i64.p0(<5 x i64>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i64.p0(<4 x i64>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i64.p0(<3 x i64>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i64.p0(<1 x i64>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v16i32.p0(<16 x i32>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v15i32.p0(<15 x i32>, ptr, i32, <15 x i1>)
+declare void @llvm.masked.store.v14i32.p0(<14 x i32>, ptr, i32, <14 x i1>)
+declare void @llvm.masked.store.v13i32.p0(<13 x i32>, ptr, i32, <13 x i1>)
+declare void @llvm.masked.store.v12i32.p0(<12 x i32>, ptr, i32, <12 x i1>)
+declare void @llvm.masked.store.v11i32.p0(<11 x i32>, ptr, i32, <11 x i1>)
+declare void @llvm.masked.store.v10i32.p0(<10 x i32>, ptr, i32, <10 x i1>)
+declare void @llvm.masked.store.v9i32.p0(<9 x i32>, ptr, i32, <9 x i1>)
+declare void @llvm.masked.store.v8i32.p0(<8 x i32>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v7i32.p0(<7 x i32>, ptr, i32, <7 x i1>)
+declare void @llvm.masked.store.v6i32.p0(<6 x i32>, ptr, i32, <6 x i1>)
+declare void @llvm.masked.store.v5i32.p0(<5 x i32>, ptr, i32, <5 x i1>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare void @llvm.masked.store.v3i32.p0(<3 x i32>, ptr, i32, <3 x i1>)
+declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
+declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32, <1 x i1>)
+
+declare void @llvm.masked.store.v32i16.p0(<32 x i16>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
+
+declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
+declare void @llvm.masked.store.v32i8.p0(<32 x i8>, ptr, i32, <32 x i1>)
+declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr>, i32, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.scatter.v8f64.v8p0(<8 x double>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1f64.v1p0(<1 x double>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v8i64.v8p0(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
+declare void @llvm.masked.scatter.v1i64.v1p0(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
+
+declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
+
+declare void @llvm.masked.scatter.v32i16.v32p0(<32 x i16>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
+
+declare void @llvm.masked.scatter.v64i8.v64p0(<64 x i8>, <64 x ptr>, i32, <64 x i1>)
+declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
+
+declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(ptr, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.expandload.v2f64(ptr, <2 x i1>, <2 x double>)
+declare <1 x double> @llvm.masked.expandload.v1f64(ptr, <1 x i1>, <1 x double>)
+
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <4 x float> @llvm.masked.expandload.v4f32(ptr, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.expandload.v2f32(ptr, <2 x i1>, <2 x float>)
+
+declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(ptr, <4 x i1>, <4 x i64>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(ptr, <2 x i1>, <2 x i64>)
+declare <1 x i64> @llvm.masked.expandload.v1i64(ptr, <1 x i1>, <1 x i64>)
+
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(ptr, <8 x i1>, <8 x i32>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(ptr, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.expandload.v2i32(ptr, <2 x i1>, <2 x i32>)
+
+declare <32 x i16> @llvm.masked.expandload.v32i16(ptr, <32 x i1>, <32 x i16>)
+declare <16 x i16> @llvm.masked.expandload.v16i16(ptr, <16 x i1>, <16 x i16>)
+declare <8 x i16> @llvm.masked.expandload.v8i16(ptr, <8 x i1>, <8 x i16>)
+declare <4 x i16> @llvm.masked.expandload.v4i16(ptr, <4 x i1>, <4 x i16>)
+
+declare <64 x i8> @llvm.masked.expandload.v64i8(ptr, <64 x i1>, <64 x i8>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(ptr, <32 x i1>, <32 x i8>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(ptr, <16 x i1>, <16 x i8>)
+declare <8 x i8> @llvm.masked.expandload.v8i8(ptr, <8 x i1>, <8 x i8>)
+
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1f64(<1 x double>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, ptr, <2 x i1>)
+declare void @llvm.masked.compressstore.v1i64(<1 x i64>, ptr, <1 x i1>)
+
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, ptr, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i32(<2 x i32>, ptr, <2 x i1>)
+
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, ptr, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i16(<4 x i16>, ptr, <4 x i1>)
+
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr, <64 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, ptr, <32 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
+declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ac3c47c..200e9d1 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -395,6 +395,7 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_SADDSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_USUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 499c08f..7921de6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -15,7 +15,7 @@
define void @mul_wrong_pow_2(ptr %addr) { ret void }
define void @more_than_one_use_shl_1(ptr %addr) { ret void }
define void @more_than_one_use_shl_2(ptr %addr) { ret void }
- define void @more_than_one_use_shl_lsl_fast(ptr %addr) #1 { ret void }
+ define void @more_than_one_use_shl_lsl_fast(ptr %addr) { ret void }
define void @more_than_one_use_shl_lsl_slow(ptr %addr) { ret void }
define void @more_than_one_use_shl_minsize(ptr %addr) #0 { ret void }
define void @ldrwrox(ptr %addr) { ret void }
@@ -24,7 +24,6 @@
define void @ldbbrox(ptr %addr) { ret void }
define void @ldrqrox(ptr %addr) { ret void }
attributes #0 = { optsize }
- attributes #1 = { "target-features"="+addr-lsl-fast" }
...
---
@@ -478,11 +477,10 @@ body: |
; CHECK: liveins: $x0, $x1, $x2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[ADDXrs:%[0-9]+]]:gpr64common = ADDXrs [[COPY1]], [[COPY]], 3
- ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui [[ADDXrs]], 0 :: (load (s64) from %ir.addr)
- ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRXui1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1
+ ; CHECK-NEXT: [[LDRXroX:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[LDRXroX1:%[0-9]+]]:gpr64 = LDRXroX [[COPY1]], [[COPY]], 0, 1 :: (load (s64) from %ir.addr)
+ ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXroX]], [[LDRXroX1]]
; CHECK-NEXT: $x2 = COPY [[ADDXrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x2
%0:gpr(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 59cd87f..022aaea 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-slow-14 | FileCheck %s --check-prefixes=CHECK,CHECK0
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK3
%struct.a = type [256 x i16]
%struct.b = type [256 x i32]
@@ -49,36 +49,20 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
}
define i32 @word(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: word:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #2
-; CHECK0-NEXT: ldr w20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov w0, w20
-; CHECK0-NEXT: str w20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: word:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov w0, w20
-; CHECK3-NEXT: str w20, [x19, x21, lsl #2]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: word:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: str w20, [x19, x21, lsl #2]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -90,36 +74,20 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind {
}
define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
-; CHECK0-LABEL: doubleword:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK0-NEXT: ubfx x8, x1, #9, #8
-; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT: mov x19, x0
-; CHECK0-NEXT: lsl x21, x8, #3
-; CHECK0-NEXT: ldr x20, [x0, x21]
-; CHECK0-NEXT: bl foo
-; CHECK0-NEXT: mov x0, x20
-; CHECK0-NEXT: str x20, [x19, x21]
-; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: doubleword:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK3-NEXT: ubfx x21, x1, #9, #8
-; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK3-NEXT: mov x19, x0
-; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3]
-; CHECK3-NEXT: bl foo
-; CHECK3-NEXT: mov x0, x20
-; CHECK3-NEXT: str x20, [x19, x21, lsl #3]
-; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK3-NEXT: ret
+; CHECK-LABEL: doubleword:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: ubfx x21, x1, #9, #8
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: str x20, [x19, x21, lsl #3]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -163,20 +131,12 @@ endbb:
}
define i64 @gep3(ptr %p, i64 %b) {
-; CHECK0-LABEL: gep3:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: lsl x9, x1, #3
-; CHECK0-NEXT: mov x8, x0
-; CHECK0-NEXT: ldr x0, [x0, x9]
-; CHECK0-NEXT: str x1, [x8, x9]
-; CHECK0-NEXT: ret
-;
-; CHECK3-LABEL: gep3:
-; CHECK3: // %bb.0:
-; CHECK3-NEXT: mov x8, x0
-; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3]
-; CHECK3-NEXT: str x1, [x8, x1, lsl #3]
-; CHECK3-NEXT: ret
+; CHECK-LABEL: gep3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: ldr x0, [x0, x1, lsl #3]
+; CHECK-NEXT: str x1, [x8, x1, lsl #3]
+; CHECK-NEXT: ret
%g = getelementptr inbounds i64, ptr %p, i64 %b
%l = load i64, ptr %g
store i64 %b, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
index 573f921..e31c9a0 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -134,9 +134,8 @@ define void @test8(i64 %a, ptr noalias %src, ptr noalias %dst, i64 %n) {
; CHECK-NEXT: b.hs .LBB7_1
; CHECK-NEXT: // %bb.3: // %if.then
; CHECK-NEXT: // in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT: lsl x10, x8, #3
-; CHECK-NEXT: ldr x11, [x1, x10]
-; CHECK-NEXT: str x11, [x2, x10]
+; CHECK-NEXT: ldr x10, [x1, x8, lsl #3]
+; CHECK-NEXT: str x10, [x2, x8, lsl #3]
; CHECK-NEXT: b .LBB7_1
; CHECK-NEXT: .LBB7_4: // %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index d593272..6bcd2f0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -125,7 +125,7 @@ return: ; preds = %if.end23, %if.then3
}
; CHECK: @test
-; CHECK-NOT: , uxtw #2]
+; CHECK: , uxtw #2]
define i32 @test(ptr %array, i8 zeroext %c, i32 %arg) {
entry:
%conv = zext i8 %c to i32
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 3542b26..5b055a4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -201,11 +201,10 @@ define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_64x1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x1
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
@@ -238,11 +237,10 @@ define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_32x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x2
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
@@ -275,11 +273,10 @@ define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_16x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x4
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
@@ -312,11 +309,10 @@ define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_8x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x8
-; CHECK-NEXT: lsl x9, x1, #3
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x8]
-; CHECK-NEXT: ldr d0, [x0, x9]
+; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: str d0, [x8, x9]
+; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset
diff --git a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
index 8f19553..634d1b9 100644
--- a/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
@@ -82,13 +82,12 @@ define void @avoid_promotion_2_and(ptr nocapture noundef %arg) {
; CHECK-NEXT: eor w10, w10, w11
; CHECK-NEXT: ldur w11, [x8, #-24]
; CHECK-NEXT: and w10, w10, w14
-; CHECK-NEXT: ldp x15, x14, [x8, #-16]
-; CHECK-NEXT: ubfiz x13, x10, #1, #32
+; CHECK-NEXT: ldp x14, x13, [x8, #-16]
; CHECK-NEXT: str w10, [x8]
-; CHECK-NEXT: and w10, w11, w12
-; CHECK-NEXT: ldrh w11, [x14, x13]
-; CHECK-NEXT: strh w11, [x15, w10, uxtw #1]
-; CHECK-NEXT: strh w12, [x14, x13]
+; CHECK-NEXT: and w11, w11, w12
+; CHECK-NEXT: ldrh w15, [x13, w10, uxtw #1]
+; CHECK-NEXT: strh w15, [x14, w11, uxtw #1]
+; CHECK-NEXT: strh w12, [x13, w10, uxtw #1]
; CHECK-NEXT: b LBB1_1
; CHECK-NEXT: LBB1_4: ; %exit
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
index b5c2104..50c70c5 100644
--- a/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
+++ b/llvm/test/CodeGen/AArch64/cheap-as-a-move.ll
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux"
define void @f0(ptr %a, i64 %n) {
; CHECK-LABEL: f0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
@@ -15,7 +15,6 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -40
; CHECK-NEXT: .cfi_offset w30, -48
; CHECK-NEXT: mov x21, #1 // =0x1
; CHECK-NEXT: mov x19, x1
@@ -27,18 +26,17 @@ define void @f0(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB0_2
; CHECK-NEXT: .LBB0_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x23, x22, #2
+; CHECK-NEXT: ldr w0, [x20, x22, lsl #2]
; CHECK-NEXT: mov x1, x21
-; CHECK-NEXT: ldr w0, [x20, x23]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x23]
+; CHECK-NEXT: str w0, [x20, x22, lsl #2]
; CHECK-NEXT: add x22, x22, #1
; CHECK-NEXT: cmp x22, x19
; CHECK-NEXT: b.lt .LBB0_1
; CHECK-NEXT: .LBB0_2: // %exit
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
@@ -64,15 +62,13 @@ exit:
define void @f1(ptr %a, i64 %n) {
; CHECK-LABEL: f1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w30, -48
+; CHECK-NEXT: .cfi_offset w30, -32
; CHECK-NEXT: mov x19, x1
; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: mov x21, xzr
@@ -80,19 +76,17 @@ define void @f1(ptr %a, i64 %n) {
; CHECK-NEXT: b.ge .LBB1_2
; CHECK-NEXT: .LBB1_1: // %loop.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lsl x22, x21, #2
+; CHECK-NEXT: ldr w0, [x20, x21, lsl #2]
; CHECK-NEXT: mov x1, #1450704896 // =0x56780000
; CHECK-NEXT: movk x1, #4660, lsl #48
-; CHECK-NEXT: ldr w0, [x20, x22]
; CHECK-NEXT: bl g
-; CHECK-NEXT: str w0, [x20, x22]
+; CHECK-NEXT: str w0, [x20, x21, lsl #2]
; CHECK-NEXT: add x21, x21, #1
; CHECK-NEXT: cmp x21, x19
; CHECK-NEXT: b.lt .LBB1_1
; CHECK-NEXT: .LBB1_2: // %exit
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
index d4ea143..b87157a 100644
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -972,10 +972,9 @@ define void @pr38938(ptr %a0, ptr %a1) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x1]
; CHECK-NEXT: ubfx x8, x8, #21, #10
-; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: ldr w9, [x0, x8]
+; CHECK-NEXT: ldr w9, [x0, x8, lsl #2]
; CHECK-NEXT: add w9, w9, #1
-; CHECK-NEXT: str w9, [x0, x8]
+; CHECK-NEXT: str w9, [x0, x8, lsl #2]
; CHECK-NEXT: ret
%tmp = load i64, ptr %a1, align 8
%tmp1 = lshr i64 %tmp, 21
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index 491bf40..c0f7678 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -903,6 +903,58 @@ define <8 x i16> @shadd_fixedwidth_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %res
}
+define <8 x i16> @shadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: shadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @srhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: srhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @uhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: uhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
+define <8 x i16> @urhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: urhadd_demandedelts:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: urhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %s0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %op = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %s0, <8 x i16> %a1)
+ %r0 = shufflevector <8 x i16> %op, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %r0
+}
+
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
@@ -927,4 +979,4 @@ declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 30123a3..e8dafd5 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -223,10 +223,9 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB3_3 Depth 3
-; CHECK-NEXT: lsl x12, x11, #3
+; CHECK-NEXT: ldr x13, [x1, x11, lsl #3]
+; CHECK-NEXT: ldr x12, [x10, x11, lsl #3]
; CHECK-NEXT: mov x14, x4
-; CHECK-NEXT: ldr x13, [x1, x12]
-; CHECK-NEXT: ldr x12, [x10, x12]
; CHECK-NEXT: ldr w13, [x13]
; CHECK-NEXT: .LBB3_3: // %for.body8
; CHECK-NEXT: // Parent Loop BB3_1 Depth=1
diff --git a/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 0000000..728cffe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,50 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- ok.ll
+
+; RUN: llc -mtriple=aarch64-linux ok.ll -o - | \
+; RUN: FileCheck %s --check-prefix=ASM
+; RUN: llc -mtriple=aarch64-linux ok.ll -filetype=obj -o - | \
+; RUN: llvm-readelf --notes - | FileCheck %s --check-prefix=OBJ
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 268435458}
+!1 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 85}
+
+; ASM: .section .note.gnu.property,"a",@note
+; ASM-NEXT: .p2align 3, 0x0
+; ASM-NEXT: .word 4
+; ASM-NEXT: .word 24
+; ASM-NEXT: .word 5
+; ASM-NEXT: .asciz "GNU"
+; 3221225473 = 0xc0000001 = GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+; ASM-NEXT: .word 3221225473
+; ASM-NEXT: .word 16
+; ASM-NEXT: .xword 268435458
+; ASM-NEXT: .xword 85
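+; A reading of the note words above (decoded from the ASM checks themselves):
+; namesz=4 (for "GNU\0"), descsz=24, type=5 (NT_GNU_PROPERTY_TYPE_0); the
+; descriptor carries a single property with pr_type=0xc0000001
+; (GNU_PROPERTY_AARCH64_FEATURE_PAUTH) and pr_datasz=16, followed by the two
+; 8-byte values: platform 268435458 = 0x10000002 and version 85 = 0x55.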
+
+; OBJ: Displaying notes found in: .note.gnu.property
+; OBJ-NEXT: Owner Data size Description
+; OBJ-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+; OBJ-NEXT: AArch64 PAuth ABI core info: platform 0x10000002 (llvm_linux), version 0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)
+
+; ERR: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llc -mtriple=aarch64-linux err1.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llc -mtriple=aarch64-linux err2.ll 2>&1 -o - | \
+; RUN: FileCheck %s --check-prefix=ERR
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index 9e09b7f..789fd7b 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -2,8 +2,6 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for vec
-
declare i4 @llvm.sadd.sat.i4(i4, i4)
declare i8 @llvm.sadd.sat.i8(i8, i8)
declare i16 @llvm.sadd.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 6f1ae02..8a0e766 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +49,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: sqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: sqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: sqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: sqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: sqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: sqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: sqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: sqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: sqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: sqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -98,23 +94,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: sqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: sqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: sqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: sqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: sqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: sqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: sqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: sqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: sqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: sqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: sqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -135,19 +145,42 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: sqadd v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: sqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -196,23 +229,37 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: add x9, x1, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: add x9, x1, #2
+; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: sqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -230,15 +277,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: sqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: sqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: sqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: sqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sqadd v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.sadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -346,23 +405,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: sqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: sqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: sqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: sqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: sqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: sqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: sqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: sqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: sqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: sqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -377,23 +450,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: sqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: sqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: sqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: sqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: sqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: sqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: sqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: sqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: sqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: sqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: sqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: sqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 5200722..f65a08a 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -100,7 +100,7 @@ exit:
}
; Address calculation cheap enough on some cores.
-define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define i32 @f3(i1 %c1, ptr %p, i64 %i) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: tbz w0, #0, .LBB3_2
@@ -130,7 +130,7 @@ exit:
ret i32 %v
}
-define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-lsl-fast" {
+define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: f4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp x1, #1
diff --git a/llvm/test/CodeGen/AArch64/sms-regpress.mir b/llvm/test/CodeGen/AArch64/sms-regpress.mir
new file mode 100644
index 0000000..c75eba5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-regpress.mir
@@ -0,0 +1,160 @@
+# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-max-mii=40 -pipeliner-register-pressure -pipeliner-ii-search-range=30 -debug-only=pipeliner 2>&1 | FileCheck %s
+
+# REQUIRES: asserts
+
+# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
+# The specific value of II is not important.
+
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Rejected the schedule because of too high register pressure{{$}}
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Schedule Found? 1 (II={{[0-9]+}}){{$}}
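+# In other words (as the flags on the RUN line and the debug output above
+# indicate): with -pipeliner-register-pressure the pipeliner estimates the
+# register pressure of each candidate schedule, discards any schedule whose
+# pressure is too high, and retries with the next II within
+# -pipeliner-ii-search-range until a schedule is accepted.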
+
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+ define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr {
+ entry:
+ %0 = load double, ptr %a, align 8
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 8
+ %1 = load double, ptr %arrayidx1, align 8
+ %cmp133 = icmp sgt i32 %n, 0
+ br i1 %cmp133, label %for.body.preheader, label %for.cond.cleanup
+
+ for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body, %entry
+ %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add54, %for.body ]
+ ret double %res.0.lcssa
+
+ for.body: ; preds = %for.body.preheader, %for.body
+ %lsr.iv137 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+ %lsr.iv = phi ptr [ %b, %for.body.preheader ], [ %scevgep, %for.body ]
+ %res.0135 = phi double [ 0.000000e+00, %for.body.preheader ], [ %add54, %for.body ]
+ %2 = load double, ptr %lsr.iv, align 8
+ %3 = tail call double @llvm.fmuladd.f64(double %0, double %2, double %0)
+ %4 = tail call double @llvm.fmuladd.f64(double %3, double %2, double %3)
+ %5 = tail call double @llvm.fmuladd.f64(double %4, double %2, double %4)
+ %6 = tail call double @llvm.fmuladd.f64(double %5, double %2, double %5)
+ %7 = tail call double @llvm.fmuladd.f64(double %6, double %2, double %6)
+ %8 = tail call double @llvm.fmuladd.f64(double %7, double %2, double %7)
+ %9 = tail call double @llvm.fmuladd.f64(double %8, double %2, double %8)
+ %10 = tail call double @llvm.fmuladd.f64(double %9, double %2, double %9)
+ %11 = tail call double @llvm.fmuladd.f64(double %10, double %2, double %10)
+ %12 = tail call double @llvm.fmuladd.f64(double %11, double %2, double %11)
+ %13 = tail call double @llvm.fmuladd.f64(double %12, double %2, double %12)
+ %14 = tail call double @llvm.fmuladd.f64(double %13, double %2, double %13)
+ %15 = tail call double @llvm.fmuladd.f64(double %14, double %2, double %14)
+ %16 = tail call double @llvm.fmuladd.f64(double %15, double %2, double %15)
+ %17 = tail call double @llvm.fmuladd.f64(double %16, double %2, double %16)
+ %18 = tail call double @llvm.fmuladd.f64(double %17, double %2, double %17)
+ %add = fadd double %17, %18
+ %19 = tail call double @llvm.fmuladd.f64(double %18, double %2, double %add)
+ %add35 = fadd double %10, %19
+ %20 = tail call double @llvm.fmuladd.f64(double %3, double %2, double %add35)
+ %add38 = fadd double %11, %20
+ %21 = tail call double @llvm.fmuladd.f64(double %4, double %2, double %add38)
+ %add41 = fadd double %12, %21
+ %22 = tail call double @llvm.fmuladd.f64(double %5, double %2, double %add41)
+ %add44 = fadd double %14, %15
+ %add45 = fadd double %13, %add44
+ %add46 = fadd double %add45, %22
+ %23 = tail call double @llvm.fmuladd.f64(double %6, double %2, double %add46)
+ %mul = fmul double %2, %7
+ %mul51 = fmul double %1, %mul
+ %24 = tail call double @llvm.fmuladd.f64(double %mul51, double %9, double %23)
+ %25 = tail call double @llvm.fmuladd.f64(double %8, double %1, double %24)
+ %add54 = fadd double %res.0135, %25
+ %scevgep = getelementptr i8, ptr %lsr.iv, i64 8
+ %lsr.iv.next = add nsw i64 %lsr.iv137, -1
+ %exitcond.not = icmp eq i64 %lsr.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+ }
+
+ declare double @llvm.fmuladd.f64(double, double, double)
+
+...
+---
+name: kernel
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '%10' }
+ - { reg: '$x1', virtual-reg: '%11' }
+ - { reg: '$w2', virtual-reg: '%12' }
+body: |
+ bb.0.entry:
+ successors: %bb.1, %bb.4
+ liveins: $x0, $x1, $w2
+
+ %12:gpr32common = COPY $w2
+ %11:gpr64 = COPY $x1
+ %10:gpr64common = COPY $x0
+ dead $wzr = SUBSWri %12, 1, 0, implicit-def $nzcv
+ Bcc 10, %bb.1, implicit $nzcv
+
+ bb.4:
+ %13:fpr64 = FMOVD0
+ B %bb.2
+
+ bb.1.for.body.preheader:
+ %0:fpr64 = LDRDui %10, 0 :: (load (s64) from %ir.a)
+ %1:fpr64 = LDRDui %10, 1 :: (load (s64) from %ir.arrayidx1)
+ %16:gpr32 = ORRWrs $wzr, %12, 0
+ %2:gpr64all = SUBREG_TO_REG 0, killed %16, %subreg.sub_32
+ %15:fpr64 = FMOVD0
+ B %bb.3
+
+ bb.2.for.cond.cleanup:
+ %3:fpr64 = PHI %13, %bb.4, %7, %bb.3
+ $d0 = COPY %3
+ RET_ReallyLR implicit $d0
+
+ bb.3.for.body:
+ successors: %bb.2, %bb.3
+
+ %4:gpr64sp = PHI %2, %bb.1, %9, %bb.3
+ %5:gpr64sp = PHI %11, %bb.1, %8, %bb.3
+ %6:fpr64 = PHI %15, %bb.1, %7, %bb.3
+ early-clobber %17:gpr64sp, %18:fpr64 = LDRDpost %5, 8 :: (load (s64) from %ir.lsr.iv)
+ %19:fpr64 = nofpexcept FMADDDrrr %0, %18, %0, implicit $fpcr
+ %20:fpr64 = nofpexcept FMADDDrrr %19, %18, %19, implicit $fpcr
+ %21:fpr64 = nofpexcept FMADDDrrr %20, %18, %20, implicit $fpcr
+ %22:fpr64 = nofpexcept FMADDDrrr %21, %18, %21, implicit $fpcr
+ %23:fpr64 = nofpexcept FMADDDrrr %22, %18, %22, implicit $fpcr
+ %24:fpr64 = nofpexcept FMADDDrrr %23, %18, %23, implicit $fpcr
+ %25:fpr64 = nofpexcept FMADDDrrr %24, %18, %24, implicit $fpcr
+ %26:fpr64 = nofpexcept FMADDDrrr %25, %18, %25, implicit $fpcr
+ %27:fpr64 = nofpexcept FMADDDrrr %26, %18, %26, implicit $fpcr
+ %28:fpr64 = nofpexcept FMADDDrrr %27, %18, %27, implicit $fpcr
+ %29:fpr64 = nofpexcept FMADDDrrr %28, %18, %28, implicit $fpcr
+ %30:fpr64 = nofpexcept FMADDDrrr %29, %18, %29, implicit $fpcr
+ %31:fpr64 = nofpexcept FMADDDrrr %30, %18, %30, implicit $fpcr
+ %32:fpr64 = nofpexcept FMADDDrrr %31, %18, %31, implicit $fpcr
+ %33:fpr64 = nofpexcept FMADDDrrr %32, %18, %32, implicit $fpcr
+ %34:fpr64 = nofpexcept FMADDDrrr %33, %18, %33, implicit $fpcr
+ %35:fpr64 = nofpexcept FADDDrr %33, %34, implicit $fpcr
+ %36:fpr64 = nofpexcept FMADDDrrr %34, %18, killed %35, implicit $fpcr
+ %37:fpr64 = nofpexcept FADDDrr %26, killed %36, implicit $fpcr
+ %38:fpr64 = nofpexcept FMADDDrrr %19, %18, killed %37, implicit $fpcr
+ %39:fpr64 = nofpexcept FADDDrr %27, killed %38, implicit $fpcr
+ %40:fpr64 = nofpexcept FMADDDrrr %20, %18, killed %39, implicit $fpcr
+ %41:fpr64 = nofpexcept FADDDrr %28, killed %40, implicit $fpcr
+ %42:fpr64 = nofpexcept FMADDDrrr %21, %18, killed %41, implicit $fpcr
+ %43:fpr64 = nofpexcept FADDDrr %30, %31, implicit $fpcr
+ %44:fpr64 = nofpexcept FADDDrr %29, killed %43, implicit $fpcr
+ %45:fpr64 = nofpexcept FADDDrr killed %44, killed %42, implicit $fpcr
+ %46:fpr64 = nofpexcept FMADDDrrr %22, %18, killed %45, implicit $fpcr
+ %47:fpr64 = nofpexcept FMULDrr %18, %23, implicit $fpcr
+ %48:fpr64 = nofpexcept FMULDrr %1, killed %47, implicit $fpcr
+ %49:fpr64 = nofpexcept FMADDDrrr killed %48, %25, killed %46, implicit $fpcr
+ %50:fpr64 = nofpexcept FMADDDrrr %24, %1, killed %49, implicit $fpcr
+ %7:fpr64 = nofpexcept FADDDrr %6, killed %50, implicit $fpcr
+ %8:gpr64all = COPY %17
+ %51:gpr64 = nsw SUBSXri %4, 1, 0, implicit-def $nzcv
+ %9:gpr64all = COPY %51
+ Bcc 0, %bb.2, implicit $nzcv
+ B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index abeb4b3..4d755f4 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -2,8 +2,6 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for vec
-
declare i4 @llvm.ssub.sat.i4(i4, i4)
declare i8 @llvm.ssub.sat.i8(i8, i8)
declare i16 @llvm.ssub.sat.i16(i16, i16)
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index d1f843a..a8c1276 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: sqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: sqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: sqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: sqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: sqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: sqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: sqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: sqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: sqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: sqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: sqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: sqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -99,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: sqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: sqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: sqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: sqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: sqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: sqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: sqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: sqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: sqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: sqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: sqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: sqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -136,19 +146,42 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
+; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: sqsub v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: sqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -197,23 +230,37 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: add x9, x1, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
+; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: add x9, x1, #2
+; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: sqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -231,15 +278,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: sqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: sqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: sqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: sqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: sqsub v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.ssub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -349,23 +408,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: sqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: sqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: sqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: sqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: sqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: sqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: sqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: sqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: sqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: sqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: sqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -380,23 +453,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: sqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: sqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: sqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: sqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: sqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: sqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: sqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: sqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: sqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: sqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: sqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: sqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
new file mode 100644
index 0000000..bcfc7b3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <4 x i32> @masked_load_v4i32(ptr %a, <4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %load = call <4 x i32> @llvm.masked.load.v4i32(ptr %a, i32 1, <4 x i1> %mask, <4 x i32> undef), !nontemporal !0
+ ret <4 x i32> %load
+}
+
+define void @masked_store_v4i32(<4 x i32> %x, ptr %a, <4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: shl v1.4s, v1.4s, #31
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %x, ptr %a, i32 1, <4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
+define <4 x i32> @load_v4i32(ptr %a) nounwind {
+; CHECK-LABEL: load_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+ %load = call <4 x i32> @llvm.masked.load.v4i32(ptr %a, i32 1, <4 x i1> <i1 1, i1 1, i1 1, i1 1>, <4 x i32> undef), !nontemporal !0
+ ret <4 x i32> %load
+}
+
+define void @store_v4i32(<4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.v4i32.p0(<4 x i32> %x, ptr %a, i32 1, <4 x i1> <i1 1, i1 1, i1 1, i1 1>), !nontemporal !0
+ ret void
+}
+
+define <vscale x 4 x i32> @masked_load_nxv4i32(ptr %a, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef), !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
+declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+
+!0 = !{i32 1}
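For contrast, a hedged standalone sketch (an assumption, not part of the patch): a plain unmasked store may carry the same !nontemporal metadata and be lowered to an ordinary contiguous store, whereas the masked forms checked above must stay as predicated SVE ld1w/st1w operations rather than being turned into unpredicated nontemporal accesses. The metadata convention (!{i32 1}) matches the new test file.

; Illustrative only; not part of the test above.
define void @plain_nontemporal_store(<4 x i32> %x, ptr %a) {
  store <4 x i32> %x, ptr %a, align 16, !nontemporal !0
  ret void
}
!0 = !{i32 1}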
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index f0bbed5..30ff700 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -67,23 +49,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uqadd v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: uqadd v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: uqadd v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: uqadd v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: uqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: uqadd v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: uqadd v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uqadd v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: uqadd v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: uqadd v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: uqadd v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: uqadd v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -98,23 +94,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: uqadd v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: uqadd v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: uqadd v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: uqadd v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: uqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uqadd v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: uqadd v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: uqadd v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: uqadd v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: uqadd v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: uqadd v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: uqadd v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -135,16 +145,39 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s1, [x0]
-; CHECK-NEXT: ldr s2, [x1]
-; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v2.8b
-; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s1, [x0]
+; CHECK-SD-NEXT: ldr s2, [x1]
+; CHECK-SD-NEXT: movi d0, #0xff00ff00ff00ff
+; CHECK-SD-NEXT: uaddl v1.8h, v1.8b, v2.8b
+; CHECK-SD-NEXT: umin v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: uqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -194,24 +227,38 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: umin v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
+; CHECK-SD-NEXT: ldrh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: uqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -229,15 +276,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: uqadd v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: uqadd v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: uqadd v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: uqadd v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: uqadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: uqadd v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.uadd.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -336,23 +395,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: uqadd v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: uqadd v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: uqadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: uqadd v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: uqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: uqadd v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: uqadd v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: uqadd v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: uqadd v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: uqadd v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: uqadd v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: uqadd v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -367,23 +440,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: uqadd v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: uqadd v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: uqadd v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqadd v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: uqadd v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: uqadd v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqadd v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: uqadd v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: uqadd v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: uqadd v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqadd v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: uqadd v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: uqadd v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: uqadd v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 82c0327..3bc2796 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -2,28 +2,10 @@
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64-- -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for v16i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v64i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v32i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i16
+; CHECK-GI: warning: Instruction selection used fallback path for v2i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v12i16
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i4
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i1
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v16i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v4i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v8i64
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v2i128
declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>)
@@ -68,23 +50,37 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
}
define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
-; CHECK-LABEL: v32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: uqsub v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT: uqsub v0.16b, v0.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT: uqsub v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT: ret
%z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
ret <32 x i8> %z
}
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
-; CHECK-LABEL: v64i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: uqsub v0.16b, v0.16b, v4.16b
-; CHECK-NEXT: uqsub v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: uqsub v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v64i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.16b, v2.16b, v6.16b
+; CHECK-SD-NEXT: uqsub v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: uqsub v1.16b, v1.16b, v5.16b
+; CHECK-SD-NEXT: uqsub v3.16b, v3.16b, v7.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v64i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: uqsub v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT: uqsub v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT: uqsub v3.16b, v3.16b, v7.16b
+; CHECK-GI-NEXT: ret
%z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
ret <64 x i8> %z
}
@@ -99,23 +95,37 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
}
define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
-; CHECK-LABEL: v16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: uqsub v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.8h, v1.8h, v3.8h
+; CHECK-SD-NEXT: uqsub v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT: uqsub v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT: ret
%z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
ret <16 x i16> %z
}
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
-; CHECK-LABEL: v32i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.8h, v2.8h, v6.8h
-; CHECK-NEXT: uqsub v0.8h, v0.8h, v4.8h
-; CHECK-NEXT: uqsub v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uqsub v3.8h, v3.8h, v7.8h
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v32i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.8h, v2.8h, v6.8h
+; CHECK-SD-NEXT: uqsub v0.8h, v0.8h, v4.8h
+; CHECK-SD-NEXT: uqsub v1.8h, v1.8h, v5.8h
+; CHECK-SD-NEXT: uqsub v3.8h, v3.8h, v7.8h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v32i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v4.8h
+; CHECK-GI-NEXT: uqsub v1.8h, v1.8h, v5.8h
+; CHECK-GI-NEXT: uqsub v2.8h, v2.8h, v6.8h
+; CHECK-GI-NEXT: uqsub v3.8h, v3.8h, v7.8h
+; CHECK-GI-NEXT: ret
%z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
ret <32 x i16> %z
}
@@ -136,16 +146,39 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v4i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: str s0, [x2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uqsub v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: str s0, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s1, w9
+; CHECK-GI-NEXT: mov b2, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v1.b[1]
+; CHECK-GI-NEXT: mov b4, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v0.b[3]
+; CHECK-GI-NEXT: mov b6, v1.b[3]
+; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT: mov b2, v1.b[2]
+; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT: uqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: str w8, [x2]
+; CHECK-GI-NEXT: ret
%x = load <4 x i8>, ptr %px
%y = load <4 x i8>, ptr %py
%z = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %x, <4 x i8> %y)
@@ -193,22 +226,36 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
}
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v2i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strh w9, [x2]
-; CHECK-NEXT: strh w8, [x2, #2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v2i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldrh w8, [x0]
+; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: ldrh w10, [x0, #2]
+; CHECK-SD-NEXT: ldrh w11, [x1, #2]
+; CHECK-SD-NEXT: fmov s0, w8
+; CHECK-SD-NEXT: fmov s1, w9
+; CHECK-SD-NEXT: mov v0.s[1], w10
+; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: fmov w9, s0
+; CHECK-SD-NEXT: strh w9, [x2]
+; CHECK-SD-NEXT: strh w8, [x2, #2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v2i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: ldr h2, [x1]
+; CHECK-GI-NEXT: ldr h3, [x1, #2]
+; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT: mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT: uqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NEXT: str h0, [x2]
+; CHECK-GI-NEXT: str h1, [x2, #2]
+; CHECK-GI-NEXT: ret
%x = load <2 x i16>, ptr %px
%y = load <2 x i16>, ptr %py
%z = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %x, <2 x i16> %y)
@@ -226,15 +273,27 @@ define <12 x i8> @v12i8(<12 x i8> %x, <12 x i8> %y) nounwind {
}
define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
-; CHECK-LABEL: v12i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: uqsub v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: uqsub v1.8h, v2.8h, v3.8h
-; CHECK-NEXT: str q0, [x2]
-; CHECK-NEXT: str d1, [x2, #16]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v12i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldp q0, q3, [x1]
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: uqsub v0.8h, v1.8h, v0.8h
+; CHECK-SD-NEXT: uqsub v1.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT: str q0, [x2]
+; CHECK-SD-NEXT: str d1, [x2, #16]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v12i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldr d2, [x0, #16]
+; CHECK-GI-NEXT: ldr d3, [x1, #16]
+; CHECK-GI-NEXT: uqsub v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT: uqsub v1.4h, v2.4h, v3.4h
+; CHECK-GI-NEXT: str q0, [x2]
+; CHECK-GI-NEXT: str d1, [x2, #16]
+; CHECK-GI-NEXT: ret
%x = load <12 x i16>, ptr %px
%y = load <12 x i16>, ptr %py
%z = call <12 x i16> @llvm.usub.sat.v12i16(<12 x i16> %x, <12 x i16> %y)
@@ -334,23 +393,37 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
}
define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: v8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: uqsub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: uqsub v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: uqsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: ret
%z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
ret <8 x i32> %z
}
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
-; CHECK-LABEL: v16i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.4s, v2.4s, v6.4s
-; CHECK-NEXT: uqsub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: uqsub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: uqsub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v16i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.4s, v2.4s, v6.4s
+; CHECK-SD-NEXT: uqsub v0.4s, v0.4s, v4.4s
+; CHECK-SD-NEXT: uqsub v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: uqsub v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v16i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT: uqsub v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT: uqsub v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: uqsub v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT: ret
%z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
ret <16 x i32> %z
}
@@ -365,23 +438,37 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
}
define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
-; CHECK-LABEL: v4i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v1.2d, v1.2d, v3.2d
-; CHECK-NEXT: uqsub v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v4i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: uqsub v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v4i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: uqsub v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT: ret
%z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
ret <4 x i64> %z
}
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
-; CHECK-LABEL: v8i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uqsub v2.2d, v2.2d, v6.2d
-; CHECK-NEXT: uqsub v0.2d, v0.2d, v4.2d
-; CHECK-NEXT: uqsub v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: uqsub v3.2d, v3.2d, v7.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: v8i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uqsub v2.2d, v2.2d, v6.2d
+; CHECK-SD-NEXT: uqsub v0.2d, v0.2d, v4.2d
+; CHECK-SD-NEXT: uqsub v1.2d, v1.2d, v5.2d
+; CHECK-SD-NEXT: uqsub v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: v8i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: uqsub v0.2d, v0.2d, v4.2d
+; CHECK-GI-NEXT: uqsub v1.2d, v1.2d, v5.2d
+; CHECK-GI-NEXT: uqsub v2.2d, v2.2d, v6.2d
+; CHECK-GI-NEXT: uqsub v3.2d, v3.2d, v7.2d
+; CHECK-GI-NEXT: ret
%z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
ret <8 x i64> %z
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index c25b0f2..78d9084 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB0_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB1_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB2_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: .LBB3_2: ; %endif
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
%value = load i32, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
index 303dc46..5c22d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
index 63702d2..e005c38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB1_2: ; %bb1
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
br i1 %val, label %bb0, label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 352adac..af6f6913 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -39,9 +39,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -253,8 +253,8 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -504,11 +504,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -544,11 +544,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -944,7 +944,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -952,6 +951,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -974,7 +974,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1006,7 +1005,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -1219,11 +1217,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
@@ -1258,11 +1256,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
@@ -1530,10 +1528,10 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -1557,12 +1555,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1585,12 +1583,12 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -1751,8 +1749,8 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -2006,11 +2004,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2046,11 +2044,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -2446,7 +2444,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
@@ -2454,6 +2451,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -2477,7 +2475,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2487,6 +2484,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -2509,7 +2507,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
@@ -2519,6 +2516,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3081,11 +3079,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3121,11 +3119,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3355,11 +3353,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3395,11 +3393,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3629,11 +3627,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3669,11 +3667,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -3903,11 +3901,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -3943,11 +3941,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4151,7 +4149,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
@@ -4162,6 +4159,7 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4182,7 +4180,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4216,7 +4213,6 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4419,11 +4415,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4459,11 +4455,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -4667,7 +4663,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2
@@ -4678,6 +4673,7 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -4698,7 +4694,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4732,7 +4727,6 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
@@ -4935,11 +4929,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -4975,11 +4969,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5183,7 +5177,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
@@ -5193,6 +5186,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5214,7 +5208,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
@@ -5226,6 +5219,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5246,7 +5240,6 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
@@ -5258,6 +5251,7 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5446,11 +5440,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5486,11 +5480,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -5694,7 +5688,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5704,6 +5697,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -5725,7 +5719,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5737,6 +5730,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -5757,7 +5751,6 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
@@ -5769,6 +5762,7 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 19a1d2d9..c9076a9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -186,7 +186,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB1_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9865883..bf4302c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -5678,22 +5678,18 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0
-; GFX11-NEXT: scratch_store_b16 off, v1, s0 offset:128
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[2:5], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1
-; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2
-; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3
-; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
+; GFX11-NEXT: scratch_store_b16 v0, v1, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
%ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
@@ -8827,19 +8823,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54
; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58
; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
-; GFX11-NEXT: s_add_i32 s8, s0, 0x70
-; GFX11-NEXT: s_add_i32 s9, s0, 0x60
-; GFX11-NEXT: s_add_i32 s10, s0, 0x50
-; GFX11-NEXT: s_add_i32 s11, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(31)
; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3
; GFX11-NEXT: s_waitcnt vmcnt(30)
@@ -8936,23 +8919,23 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) {
; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2
; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37
-; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1
-; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2
-; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3
-; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4
-; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b128 v0, v[96:99], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[84:87], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[80:83], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[68:71], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[64:67], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <32 x bfloat>, ptr addrspace(1) %ptr
%fpext = fpext <32 x bfloat> %load to <32 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index ac50fb8..da609bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0000
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 069c57e..6dabd8c 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -103,7 +103,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -131,7 +130,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -266,7 +264,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB1_4: ; %exit
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT: s_movk_i32 s4, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
@@ -294,7 +291,6 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB1_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -431,7 +427,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: .LBB2_4: ; %exit
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
@@ -461,7 +456,6 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB2_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -665,7 +659,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -871,7 +864,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB4_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1081,7 +1073,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB5_4: ; %exit
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -1432,7 +1423,6 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB7_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
@@ -1724,7 +1714,6 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_4: ; %exit
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index db89ad6..3b2f15c 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -114,7 +114,6 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: .LBB3_2: ; %bb2
; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i1_arg_i1_use:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index acadee2..401cbce 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: s_add_i32 s3, s0, 0x50
-; GFX11-NEXT: s_add_i32 s4, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:96
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:80
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:64
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:48
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:32
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT: scratch_store_b32 v0, v33, off offset:128
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s7, s0, 0x90
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v33, s0
+; GFX11-NEXT: scratch_store_b32 v0, v33, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
%ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
%val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index c1d6826..3b078c4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1989,256 +1989,138 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
; GFX11-NEXT: s_mov_b32 s2, s0
; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:1024
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:512
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:256
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:128
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:64
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:32
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 offset:16
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x7b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x7a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x790
-; GFX11-NEXT: s_add_i32 s2, s0, 0x780
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x770
-; GFX11-NEXT: s_add_i32 s2, s0, 0x760
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x750
-; GFX11-NEXT: s_add_i32 s2, s0, 0x740
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x730
-; GFX11-NEXT: s_add_i32 s2, s0, 0x720
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x710
-; GFX11-NEXT: s_add_i32 s2, s0, 0x700
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x6b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x6a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x690
-; GFX11-NEXT: s_add_i32 s2, s0, 0x680
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x670
-; GFX11-NEXT: s_add_i32 s2, s0, 0x660
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x650
-; GFX11-NEXT: s_add_i32 s2, s0, 0x640
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x630
-; GFX11-NEXT: s_add_i32 s2, s0, 0x620
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x610
-; GFX11-NEXT: s_add_i32 s2, s0, 0x600
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x5b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x5a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x590
-; GFX11-NEXT: s_add_i32 s2, s0, 0x580
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x570
-; GFX11-NEXT: s_add_i32 s2, s0, 0x560
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x550
-; GFX11-NEXT: s_add_i32 s2, s0, 0x540
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x530
-; GFX11-NEXT: s_add_i32 s2, s0, 0x520
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x510
-; GFX11-NEXT: s_add_i32 s2, s0, 0x500
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x4b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x4a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x490
-; GFX11-NEXT: s_add_i32 s2, s0, 0x480
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x470
-; GFX11-NEXT: s_add_i32 s2, s0, 0x460
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x450
-; GFX11-NEXT: s_add_i32 s2, s0, 0x440
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x430
-; GFX11-NEXT: s_add_i32 s2, s0, 0x420
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x410
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x3b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x3a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x390
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x380
-; GFX11-NEXT: s_add_i32 s2, s0, 0x370
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x360
-; GFX11-NEXT: s_add_i32 s2, s0, 0x350
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x340
-; GFX11-NEXT: s_add_i32 s2, s0, 0x330
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x320
-; GFX11-NEXT: s_add_i32 s2, s0, 0x310
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x300
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2f0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2e0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2d0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2c0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x2b0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x2a0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x290
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x280
-; GFX11-NEXT: s_add_i32 s2, s0, 0x270
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x260
-; GFX11-NEXT: s_add_i32 s2, s0, 0x250
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x240
-; GFX11-NEXT: s_add_i32 s2, s0, 0x230
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x220
-; GFX11-NEXT: s_add_i32 s2, s0, 0x210
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1f0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1e0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1d0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1c0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x1b0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x1a0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x190
-; GFX11-NEXT: s_add_i32 s2, s0, 0x180
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x170
-; GFX11-NEXT: s_add_i32 s2, s0, 0x160
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x150
-; GFX11-NEXT: s_add_i32 s2, s0, 0x140
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x130
-; GFX11-NEXT: s_add_i32 s2, s0, 0x120
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: s_add_i32 s2, s0, 0xf0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xd0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s2, s0, 0xb0
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x90
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x70
-; GFX11-NEXT: s_add_i32 s2, s0, 0x60
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT: s_add_i32 s1, s0, 0x50
-; GFX11-NEXT: s_add_i32 s0, s0, 48
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1984
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1472
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:448
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
ret <512 x i32> zeroinitializer
@@ -2636,7 +2518,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-LABEL: return_72xi32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212
; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208
@@ -2651,93 +2532,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:172
; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:168
; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT: s_clause 0x14
-; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:80
-; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:76
-; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:96
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:92
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:112
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:108
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:104
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:128
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:124
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:120
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT: s_clause 0x11
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112
+; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108
+; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:144
-; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:140
-; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:136
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128
+; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124
+; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:160
-; GFX11-NEXT: scratch_load_b32 v11, off, s32 offset:156
-; GFX11-NEXT: scratch_load_b32 v10, off, s32 offset:152
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144
+; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140
+; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
; GFX11-NEXT: s_clause 0xd
-; GFX11-NEXT: scratch_load_b32 v8, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v7, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v6, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v9, off, s32 offset:148
-; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:132
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:116
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:100
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:148
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT: scratch_load_b32 v16, off, s32 offset:116
+; GFX11-NEXT: scratch_load_b32 v20, off, s32 offset:100
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v32, off, s32
-; GFX11-NEXT: s_add_i32 s1, s0, 0x110
-; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT: s_add_i32 s2, s0, 0x100
-; GFX11-NEXT: s_add_i32 s3, s0, 0xf0
-; GFX11-NEXT: s_add_i32 s34, s0, 0xe0
-; GFX11-NEXT: s_add_i32 s35, s0, 0xd0
-; GFX11-NEXT: s_add_i32 s36, s0, 0xc0
-; GFX11-NEXT: s_add_i32 s37, s0, 0xb0
-; GFX11-NEXT: s_add_i32 s38, s0, 0xa0
-; GFX11-NEXT: s_add_i32 s39, s0, 0x90
-; GFX11-NEXT: s_add_i32 s40, s0, 0x70
-; GFX11-NEXT: s_add_i32 s41, s0, 0x60
-; GFX11-NEXT: s_add_i32 s42, s0, 0x50
-; GFX11-NEXT: s_add_i32 s43, s0, 48
; GFX11-NEXT: s_waitcnt vmcnt(10)
-; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT: scratch_store_b128 v0, v[60:63], off offset:272
; GFX11-NEXT: s_waitcnt vmcnt(9)
-; GFX11-NEXT: scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT: scratch_store_b128 v0, v[12:15], off offset:256
; GFX11-NEXT: s_waitcnt vmcnt(8)
-; GFX11-NEXT: scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT: scratch_store_b128 v0, v[16:19], off offset:240
; GFX11-NEXT: s_waitcnt vmcnt(7)
-; GFX11-NEXT: scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT: scratch_store_b128 v0, v[20:23], off offset:224
; GFX11-NEXT: s_waitcnt vmcnt(6)
-; GFX11-NEXT: scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT: scratch_store_b128 v0, v[56:59], off offset:208
; GFX11-NEXT: s_waitcnt vmcnt(5)
-; GFX11-NEXT: scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT: scratch_store_b128 v0, v[41:44], off offset:192
; GFX11-NEXT: s_waitcnt vmcnt(4)
-; GFX11-NEXT: scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT: scratch_store_b128 v0, v[37:40], off offset:176
; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT: scratch_store_b128 v0, v[52:55], off offset:160
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT: scratch_store_b128 v0, v[48:51], off offset:144
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: scratch_store_b128 off, v[33:36], s39
+; GFX11-NEXT: scratch_store_b128 v0, v[33:36], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: scratch_store_b128 off, v[29:32], s40
-; GFX11-NEXT: scratch_store_b128 off, v[25:28], s41
-; GFX11-NEXT: scratch_store_b128 off, v[21:24], s42
-; GFX11-NEXT: scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-NEXT: s_clause 0xc
; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164
; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168
@@ -3306,7 +3176,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-LABEL: call_72xi32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s46, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00
@@ -3353,11 +3223,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
; GFX11-NEXT: s_add_i32 s0, s32, 32
; GFX11-NEXT: s_add_i32 s1, s32, 16
+; GFX11-NEXT: s_add_i32 s2, s33, 0x200
+; GFX11-NEXT: v_writelane_b32 v60, s30, 0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT: s_add_i32 s0, s33, 0x200
-; GFX11-NEXT: v_writelane_b32 v60, s30, 0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
@@ -3373,14 +3243,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi
-; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo
+; GFX11-NEXT: s_mov_b32 s1, return_72xi32@abs32@hi
+; GFX11-NEXT: s_mov_b32 s0, return_72xi32@abs32@lo
; GFX11-NEXT: v_writelane_b32 v60, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624
; GFX11-NEXT: scratch_load_b128 v[33:36], off, s33 offset:640
-; GFX11-NEXT: s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT: s_add_i32 s2, s32, 0xa0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v32, v48
; GFX11-NEXT: s_clause 0x9
@@ -3431,38 +3301,38 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6
; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9
; GFX11-NEXT: v_mov_b32_e32 v9, v20
-; GFX11-NEXT: scratch_store_b32 off, v11, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x90
+; GFX11-NEXT: scratch_store_b32 off, v11, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x90
; GFX11-NEXT: v_mov_b32_e32 v11, v22
-; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x80
+; GFX11-NEXT: scratch_store_b128 off, v[4:7], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x80
; GFX11-NEXT: v_mov_b32_e32 v5, v16
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT: scratch_store_b128 off, v[0:3], s2
; GFX11-NEXT: v_mov_b32_e32 v0, 24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x70
+; GFX11-NEXT: s_add_i32 s2, s32, 0x70
; GFX11-NEXT: v_mov_b32_e32 v6, v17
-; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0
+; GFX11-NEXT: scratch_store_b128 off, v[12:15], s2
; GFX11-NEXT: v_mov_b32_e32 v13, v24
-; GFX11-NEXT: s_add_i32 s0, s32, 0x6c
+; GFX11-NEXT: s_add_i32 s2, s32, 0x6c
; GFX11-NEXT: v_mov_b32_e32 v7, v18
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x60
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x60
; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26
-; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x50
+; GFX11-NEXT: scratch_store_b96 off, v[56:58], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 0x50
; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
-; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: scratch_store_b128 off, v[40:43], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 64
; GFX11-NEXT: v_mov_b32_e32 v14, v25
-; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 48
+; GFX11-NEXT: scratch_store_b128 off, v[52:55], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 48
; GFX11-NEXT: v_mov_b32_e32 v16, v27
-; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 32
+; GFX11-NEXT: scratch_store_b128 off, v[36:39], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 32
; GFX11-NEXT: v_mov_b32_e32 v30, v46
-; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT: s_add_i32 s0, s32, 16
-; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT: scratch_store_b128 off, v[48:51], s2
+; GFX11-NEXT: s_add_i32 s2, s32, 16
+; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2
; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, 42
@@ -3470,10 +3340,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572
; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556
; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540
-; GFX11-NEXT: s_add_i32 s0, s33, 0x400
+; GFX11-NEXT: s_add_i32 s2, s33, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_clause 0xb
; GFX11-NEXT: scratch_load_b32 v59, off, s33
; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4
@@ -3493,7 +3363,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_addk_i32 s32, 0xf600
-; GFX11-NEXT: s_mov_b32 s33, s46
+; GFX11-NEXT: s_mov_b32 s33, s34
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 433a836..3b3e107 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -33,7 +33,7 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -103,7 +103,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_trap 2
@@ -171,7 +171,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -181,7 +181,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -189,7 +189,7 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,7 +379,7 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
-; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -472,7 +472,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
-; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,7 +481,6 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
@@ -506,7 +505,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: .LBB4_2: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
@@ -527,7 +526,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: .LBB4_2: ; %ret
; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
@@ -548,7 +547,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: .LBB4_2: ; %ret
; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-LABEL: func_uses_lds_phi_after:
@@ -570,7 +569,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: .LBB4_3: ; %ret
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
; SDAG-NEXT: .LBB4_4:
; SDAG-NEXT: s_endpgm
@@ -594,7 +593,7 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: .LBB4_3: ; %ret
; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB4_4:
; GISEL-NEXT: s_endpgm
@@ -616,6 +615,3 @@ ret:
; CHECK: {{.*}}
; GFX8: {{.*}}
; GFX9: {{.*}}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 5e76dfd..4477f02 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -157,7 +157,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: .LBB2_2:
; VI-NEXT: s_or_b64 exec, exec, s[6:7]
; VI-NEXT: s_mov_b64 s[6:7], exec
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s8, v1
; VI-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; VI-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -203,15 +202,14 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; VI-NEXT: ; %bb.7:
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_add_rtn_f32 v2, v2, v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: .LBB2_8:
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: v_add_f32_e32 v2, s2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -240,7 +238,6 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1
@@ -285,16 +282,15 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
; GFX9-NEXT: s_cbranch_execz .LBB2_8
; GFX9-NEXT: ; %bb.7:
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_8:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_add_f32_e32 v0, s2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 138dd53..d19ef75 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1260,8 +1260,6 @@ define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB11_5: ; %end
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB11_6:
; GFX11-NEXT: s_mov_b64 exec, 0
@@ -1525,8 +1523,6 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB13_6:
; GFX11-NEXT: s_mov_b64 exec, 0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index eef5f57..ecebbb9 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -32,7 +32,7 @@ define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
entry:
@@ -79,7 +79,7 @@ define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a,
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: S_WAITCNT_soft 3952
+ ; GCN-NEXT: S_WAITCNT 3952
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
entry:
diff --git a/llvm/test/CodeGen/Generic/allow-check.ll b/llvm/test/CodeGen/Generic/allow-check.ll
index 43dab68..a084889 100644
--- a/llvm/test/CodeGen/Generic/allow-check.ll
+++ b/llvm/test/CodeGen/Generic/allow-check.ll
@@ -2,6 +2,7 @@
; REQUIRES: host-byteorder-little-endian
; -global-isel=1 is unsupported.
+; XFAIL: target=loongarch{{.*}}
; XFAIL: target=nvptx{{.*}}
; XFAIL: target=sparc{{.*}}
; XFAIL: target=hexagon-{{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
index b7f8b8a..8980049 100644
--- a/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
+++ b/llvm/test/CodeGen/PowerPC/legalize-vaarg.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=BE
;RUN: llc < %s --mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=LE
+;RUN: llc < %s --mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -ppc-gather-alias-max-depth=0 | FileCheck %s -check-prefix=FORWARD
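; The FORWARD run adds -ppc-gather-alias-max-depth=0 (assumption: a depth of 0
; disables the gather alias search), so the updated va_list pointer is reloaded
; from the stack in the checks below instead of being forwarded from the store.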
define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
; BE-LABEL: test_large_vec_vaarg:
@@ -35,6 +36,22 @@ define <8 x i32> @test_large_vec_vaarg(i32 %n, ...) {
; LE-NEXT: lxvd2x 0, 0, 3
; LE-NEXT: xxswapd 35, 0
; LE-NEXT: blr
+;
+; FORWARD-LABEL: test_large_vec_vaarg:
+; FORWARD: # %bb.0:
+; FORWARD-NEXT: ld 3, -8(1)
+; FORWARD-NEXT: addi 3, 3, 15
+; FORWARD-NEXT: rldicr 3, 3, 0, 59
+; FORWARD-NEXT: addi 4, 3, 16
+; FORWARD-NEXT: std 4, -8(1)
+; FORWARD-NEXT: ld 4, -8(1)
+; FORWARD-NEXT: lvx 2, 0, 3
+; FORWARD-NEXT: addi 4, 4, 15
+; FORWARD-NEXT: rldicr 3, 4, 0, 59
+; FORWARD-NEXT: addi 4, 3, 16
+; FORWARD-NEXT: std 4, -8(1)
+; FORWARD-NEXT: lvx 3, 0, 3
+; FORWARD-NEXT: blr
%args = alloca ptr, align 4
%x = va_arg ptr %args, <8 x i32>
ret <8 x i32> %x
diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
index cebd78a..b01115c 100644
--- a/llvm/test/CodeGen/PowerPC/sms-regpress.mir
+++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir
@@ -1,41 +1,30 @@
-# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
+# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s
# REQUIRES: asserts
# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues.
# The specific value of II is not important.
-# CHECK: Try to schedule with 21
-# CHECK: Can't schedule
-# CHECK: Try to schedule with 22
-# CHECK: Can't schedule
-# CHECK: Try to schedule with 23
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 24
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 25
-# CHECK: Rejected the schedule because of too high register pressure
-# CHECK: Try to schedule with 26
-# CHECK: Schedule Found? 1 (II=26)
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Rejected the schedule because of too high register pressure{{$}}
+# CHECK: {{^ *}}Try to schedule with {{[0-9]+$}}
+# CHECK: {{^ *}}Schedule Found? 1 (II={{[0-9]+}}){{$}}
--- |
- ; ModuleID = 'a.ll'
- source_filename = "a.c"
target datalayout = "e-m:e-Fn32-i64:64-n32:64"
target triple = "ppc64le"
- ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable
- define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 {
+ define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr {
entry:
- %0 = load double, ptr %a, align 8, !tbaa !3
- %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1
- %1 = load double, ptr %arrayidx1, align 8, !tbaa !3
+ %0 = load double, ptr %a, align 8
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 8
+ %1 = load double, ptr %arrayidx1, align 8
%cmp163 = icmp sgt i32 %n, 0
br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
- %wide.trip.count = zext i32 %n to i64
- %scevgep1 = getelementptr i8, ptr %b, i64 -8
+ %wide.trip.count = zext nneg i32 %n to i64
+ %scevgep167 = getelementptr i8, ptr %b, i64 -8
call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count)
br label %for.body
@@ -43,11 +32,11 @@
%res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ]
ret double %res.0.lcssa
- for.body: ; preds = %for.body, %for.body.preheader
+ for.body: ; preds = %for.body.preheader, %for.body
%res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ]
- %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ]
+ %2 = phi ptr [ %scevgep167, %for.body.preheader ], [ %3, %for.body ]
%3 = getelementptr i8, ptr %2, i64 8
- %4 = load double, ptr %3, align 8, !tbaa !3
+ %4 = load double, ptr %3, align 8
%5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0)
%6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5)
%7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6)
@@ -92,152 +81,23 @@
%mul66 = fmul double %12, %mul65
%30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165)
%31 = call i1 @llvm.loop.decrement.i64(i64 1)
- br i1 %31, label %for.body, label %for.cond.cleanup, !llvm.loop !7
+ br i1 %31, label %for.body, label %for.cond.cleanup
}
- ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
- declare double @llvm.fmuladd.f64(double, double, double) #1
+ declare double @llvm.fmuladd.f64(double, double, double)
- ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
- declare void @llvm.set.loop.iterations.i64(i64) #2
+ declare void @llvm.set.loop.iterations.i64(i64)
- ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn
- declare i1 @llvm.loop.decrement.i64(i64) #2
+ declare i1 @llvm.loop.decrement.i64(i64)
- attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" }
- attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
- attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn }
-
- !llvm.module.flags = !{!0, !1}
- !llvm.ident = !{!2}
-
- !0 = !{i32 1, !"wchar_size", i32 4}
- !1 = !{i32 7, !"uwtable", i32 2}
- !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"}
- !3 = !{!4, !4, i64 0}
- !4 = !{!"double", !5, i64 0}
- !5 = !{!"omnipotent char", !6, i64 0}
- !6 = !{!"Simple C/C++ TBAA"}
- !7 = distinct !{!7, !8, !9}
- !8 = !{!"llvm.loop.mustprogress"}
- !9 = !{!"llvm.loop.unroll.disable"}
-
...
---
name: kernel
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHCatchret: false
-hasEHScopes: false
-hasEHFunclets: false
-isOutlined: false
-debugInstrRef: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: vsfrc, preferred-register: '' }
- - { id: 1, class: vsfrc, preferred-register: '' }
- - { id: 2, class: g8rc, preferred-register: '' }
- - { id: 3, class: vsfrc, preferred-register: '' }
- - { id: 4, class: vsfrc, preferred-register: '' }
- - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 6, class: g8rc, preferred-register: '' }
- - { id: 7, class: vsfrc, preferred-register: '' }
- - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 10, class: g8rc, preferred-register: '' }
- - { id: 11, class: gprc, preferred-register: '' }
- - { id: 12, class: vsfrc, preferred-register: '' }
- - { id: 13, class: crrc, preferred-register: '' }
- - { id: 14, class: vsfrc, preferred-register: '' }
- - { id: 15, class: g8rc, preferred-register: '' }
- - { id: 16, class: g8rc, preferred-register: '' }
- - { id: 17, class: g8rc, preferred-register: '' }
- - { id: 18, class: f8rc, preferred-register: '' }
- - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' }
- - { id: 20, class: vsfrc, preferred-register: '' }
- - { id: 21, class: vsfrc, preferred-register: '' }
- - { id: 22, class: vsfrc, preferred-register: '' }
- - { id: 23, class: vsfrc, preferred-register: '' }
- - { id: 24, class: vsfrc, preferred-register: '' }
- - { id: 25, class: vsfrc, preferred-register: '' }
- - { id: 26, class: vsfrc, preferred-register: '' }
- - { id: 27, class: vsfrc, preferred-register: '' }
- - { id: 28, class: vsfrc, preferred-register: '' }
- - { id: 29, class: vsfrc, preferred-register: '' }
- - { id: 30, class: vsfrc, preferred-register: '' }
- - { id: 31, class: vsfrc, preferred-register: '' }
- - { id: 32, class: vsfrc, preferred-register: '' }
- - { id: 33, class: vsfrc, preferred-register: '' }
- - { id: 34, class: vsfrc, preferred-register: '' }
- - { id: 35, class: vsfrc, preferred-register: '' }
- - { id: 36, class: vsfrc, preferred-register: '' }
- - { id: 37, class: vsfrc, preferred-register: '' }
- - { id: 38, class: vsfrc, preferred-register: '' }
- - { id: 39, class: vsfrc, preferred-register: '' }
- - { id: 40, class: vsfrc, preferred-register: '' }
- - { id: 41, class: vsfrc, preferred-register: '' }
- - { id: 42, class: vsfrc, preferred-register: '' }
- - { id: 43, class: vsfrc, preferred-register: '' }
- - { id: 44, class: vsfrc, preferred-register: '' }
- - { id: 45, class: vsfrc, preferred-register: '' }
- - { id: 46, class: vsfrc, preferred-register: '' }
- - { id: 47, class: vsfrc, preferred-register: '' }
- - { id: 48, class: vsfrc, preferred-register: '' }
- - { id: 49, class: vsfrc, preferred-register: '' }
- - { id: 50, class: vsfrc, preferred-register: '' }
- - { id: 51, class: vsfrc, preferred-register: '' }
- - { id: 52, class: vsfrc, preferred-register: '' }
- - { id: 53, class: vsfrc, preferred-register: '' }
- - { id: 54, class: vsfrc, preferred-register: '' }
- - { id: 55, class: vsfrc, preferred-register: '' }
- - { id: 56, class: vsfrc, preferred-register: '' }
- - { id: 57, class: vsfrc, preferred-register: '' }
- - { id: 58, class: vsfrc, preferred-register: '' }
- - { id: 59, class: vsfrc, preferred-register: '' }
- - { id: 60, class: vsfrc, preferred-register: '' }
- - { id: 61, class: vsfrc, preferred-register: '' }
- - { id: 62, class: crbitrc, preferred-register: '' }
liveins:
- { reg: '$x3', virtual-reg: '%8' }
- { reg: '$x4', virtual-reg: '%9' }
- { reg: '$x5', virtual-reg: '%10' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: ''
- restorePoint: ''
-fixedStack: []
-stack: []
-entry_values: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo: {}
body: |
bb.0.entry:
successors: %bb.2(0x50000000), %bb.1(0x30000000)
@@ -251,16 +111,12 @@ body: |
BCC 44, killed %13, %bb.2
bb.1:
- successors: %bb.3(0x80000000)
-
%12:vsfrc = XXLXORdpz
B %bb.3
bb.2.for.body.preheader:
- successors: %bb.4(0x80000000)
-
- %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3)
- %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3)
+ %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a)
+ %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1)
%16:g8rc = IMPLICIT_DEF
%15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32
%17:g8rc = RLDICL killed %15, 0, 32
@@ -279,7 +135,7 @@ body: |
%4:vsfrc = PHI %14, %bb.2, %7, %bb.4
%5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4
- %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3)
+ %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3)
%6:g8rc = COPY killed %19
%20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm
%21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir
new file mode 100644
index 0000000..eda1180
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/anyext.mir
@@ -0,0 +1,902 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir \
+# RUN: -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir \
+# RUN: -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
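+#
+# With +v, G_ANYEXT of scalable vectors is selected here to the
+# PseudoVZEXT_VF2/VF4/VF8 instructions, covering element widths from i8
+# through i64 across the LMUL register groups (MF4 up to M8).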
+
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_ANYEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir
new file mode 100644
index 0000000..df0d48a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/icmp.mir
@@ -0,0 +1,534 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
+# Don't test comparisons of i1 element vectors here; legalization has already
+# widened those operands to i8, so they never reach instruction selection.
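+# For illustration (a hypothetical pre-legalization input, not checked by this
+# file): an operation such as
+#   %c:_(<vscale x 4 x s1>) = G_ICMP intpred(eq), %a(<vscale x 4 x s1>), %b
+# would have its s1 operands widened to <vscale x 4 x s8> by the legalizer, so
+# the cases below only cover s8 and wider compare operands; the s1 mask result
+# is unaffected and still appears throughout.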
+
+---
+name: icmp_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLTU_VV_MF8_:%[0-9]+]]:vr = PseudoVMSLTU_VV_MF8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_MF8_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLTU_VV_MF8_:%[0-9]+]]:vr = PseudoVMSLTU_VV_MF8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_MF8_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(ult), %0(<vscale x 1 x s8>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLT_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLT_VV_MF4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLT_VV_MF4_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLT_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLT_VV_MF4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLT_VV_MF4_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(slt), %0(<vscale x 2 x s8>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLEU_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLEU_VV_MF2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLEU_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLEU_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLEU_VV_MF2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLEU_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(uge), %0(<vscale x 4 x s8>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i8
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_M1_:%[0-9]+]]:vr = PseudoVMSLE_VV_M1 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i8
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_M1_:%[0-9]+]]:vr = PseudoVMSLE_VV_M1 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sge), %0(<vscale x 8 x s8>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(ugt), %0(<vscale x 16 x s8>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s8>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 3 /* e8 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 64 x s1>) = G_ICMP intpred(ule), %0(<vscale x 64 x s8>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF4_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_MF4_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF4_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sle), %0(<vscale x 1 x s16>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSNE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSNE_VV_MF2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSNE_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSNE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSNE_VV_MF2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSNE_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ne), %0(<vscale x 2 x s16>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i16
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i16
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(eq), %0(<vscale x 4 x s16>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M2 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ult), %0(<vscale x 8 x s16>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M4 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(slt), %0(<vscale x 16 x s16>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M8 [[DEF]], [[DEF]], -1, 4 /* e16 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 32 x s1>) = G_ICMP intpred(uge), %0(<vscale x 32 x s16>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i32
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF2_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i32
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLE_VV_MF2_:%[0-9]+]]:vr = PseudoVMSLE_VV_MF2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLE_VV_MF2_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sge), %0(<vscale x 1 x s32>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i32
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSLTU_VV_M1_:%[0-9]+]]:vr = PseudoVMSLTU_VV_M1 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i32
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSLTU_VV_M1_:%[0-9]+]]:vr = PseudoVMSLTU_VV_M1 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSLTU_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ugt), %0(<vscale x 2 x s32>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLT_VV_M2 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s32>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M4 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLEU_VV_M4 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ule), %0(<vscale x 8 x s32>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLE_VV_M8 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLE_VV_M8 [[DEF]], [[DEF]], -1, 5 /* e32 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sle), %0(<vscale x 16 x s32>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i64
+ ; RV32I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i64
+ ; RV64I: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: [[PseudoVMSEQ_VV_M1_:%[0-9]+]]:vr = PseudoVMSEQ_VV_M1 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY [[PseudoVMSEQ_VV_M1_]]
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 1 x s1>) = G_ICMP intpred(eq), %0(<vscale x 1 x s64>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSNE_VV_M2 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSNE_VV_M2 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 2 x s1>) = G_ICMP intpred(ne), %0(<vscale x 2 x s64>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M4 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M4 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 4 x s1>) = G_ICMP intpred(ult), %0(<vscale x 4 x s64>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i64
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M8 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVMSLTU_VV_M8 [[DEF]], [[DEF]], -1, 6 /* e64 */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:vrb(<vscale x 8 x s1>) = G_ICMP intpred(ult), %0(<vscale x 8 x s64>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir
new file mode 100644
index 0000000..382166f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/sext.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
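+# Each case below is expected to select PseudoVSEXT_VF<N>_<LMUL>, where <N> is
+# the ratio of destination to source element width (e.g. s8 -> s32 gives VF4)
+# and <LMUL> is the register group of the widened result.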
+
+---
+name: sext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_SEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVSEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVSEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVSEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVSEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir
new file mode 100644
index 0000000..2fc9e05
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/zext.mir
@@ -0,0 +1,900 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=instruction-select -simplify-mir -verify-machineinstrs %s -o - | FileCheck -check-prefix=RV64I %s
+
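+# Each zero extend selects a PseudoVZEXT_VF{2,4,8} pseudo: the VF suffix is
+# the ratio of destination to source element width, the MF4..M8 suffix is the
+# result LMUL, and the trailing immediates encode the destination SEW
+# (e.g. 6 /* e64 */) and the tail/mask policy (3 /* ta, ma */).
+
+# Extend from s8 element vectors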
+---
+name: zext_nxv1i16_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s16>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF8_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s8>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s16>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF8_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s8>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s16>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF8_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s8>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s16>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF8_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s8>) = COPY $v8
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s16>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ %1:vrb(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 4 /* e16 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ %1:vrb(<vscale x 32 x s16>) = G_ZEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
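+# Extend from s16 element vectors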
+---
+name: zext_nxv1i32_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_MF2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF4_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s16>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF4_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s16>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF4_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s16>) = COPY $v8
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF4_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 5 /* e32 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ %1:vrb(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
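+# Extend from s32 element vectors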
+---
+name: zext_nxv1i64_nxv1i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vr = PseudoVZEXT_VF2_M1 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:vrb(<vscale x 1 x s32>) = COPY $v8
+ %1:vrb(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m2 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vr = COPY $v8
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm2 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm2 = PseudoVZEXT_VF2_M2 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m2 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:vrb(<vscale x 2 x s32>) = COPY $v8
+ %1:vrb(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m4 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm2 = COPY $v8m2
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm4 = PseudoVZEXT_VF2_M4 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m4 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ %1:vrb(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV32I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV32I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV32I-NEXT: $v8m8 = COPY %1
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrm4 = COPY $v8m4
+ ; RV64I-NEXT: [[DEF:%[0-9]+]]:vrm8 = IMPLICIT_DEF
+ ; RV64I-NEXT: early-clobber %1:vrm8 = PseudoVZEXT_VF2_M8 [[DEF]], [[COPY]], -1, 6 /* e64 */, 3 /* ta, ma */
+ ; RV64I-NEXT: $v8m8 = COPY %1
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ %1:vrb(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir
new file mode 100644
index 0000000..3a2d40f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-anyext.mir
@@ -0,0 +1,1589 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
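+# Extending from s1 lowers to a G_SELECT between splat vectors of 1 and 0.
+# On RV32, the s64 splat immediates are first assembled from two s32 halves
+# with G_MERGE_VALUES.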
+---
+name: anyext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s8>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s16>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s8>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s16>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s8>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s16>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s8>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s16>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s8>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s16>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s8>) = G_ANYEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s16>) = G_ANYEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: anyext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v0
+ %0:_(<vscale x 64 x s8>) = G_ANYEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
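+# In the i1-source cases above there is no native vector extend from a mask,
+# so the checks expect G_ANYEXT to be lowered to a G_SELECT between splat
+# vectors of 0 and 1. On RV64 each s32 splat scalar is first widened to s64
+# with G_ANYEXT; on RV32 the s64 splat scalar for nxv8i64 is assembled from
+# two s32 constants with G_MERGE_VALUES.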
+
+# Extend from s8 element vectors
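+# These element types appear legal under +v, so the checks expect the
+# G_ANYEXT to survive legalization unchanged; the same holds for the s16
+# and s32 sections further down.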
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_ANYEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_ANYEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ANYEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ANYEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ANYEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_ANYEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ANYEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ANYEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ANYEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_ANYEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ANYEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir
new file mode 100644
index 0000000..d1df954
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-icmp.mir
@@ -0,0 +1,810 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
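+# A vector icmp on i1 operands has no direct lowering, so the checks expect
+# each <vscale x N x s1> operand to be promoted to s8 first: a G_SELECT
+# between splats of 0 and -1 materializes the mask as 0/-1 bytes, and the
+# compare is then performed at s8, yielding an s1 mask again. As in
+# legalize-anyext.mir, RV64 widens the s32 splat scalars to s64 first.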
+---
+name: icmp_nxv1i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 1 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 1 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 2 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 2 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 4 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 4 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 8 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 8 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 16 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[DEF]](<vscale x 16 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 16 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 32 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[DEF]](<vscale x 32 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 32 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv64i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv64i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 64 x s8>), [[SELECT1]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv64i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[DEF]](<vscale x 64 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[SELECT]](<vscale x 64 x s8>), [[SELECT1]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
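+# The s1-element compares above are widened first: each operand is extended
+# to an s8 vector via G_SELECT between all-zeros and all-ones splats before
+# the G_ICMP. From here on the element types (s8 through s64) are already
+# legal, so the G_ICMP is left unchanged.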
+---
+name: icmp_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv64i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv64i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv32i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv32i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv16i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv16i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv1i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv1i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv2i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv2i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv4i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv4i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: icmp_nxv8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: icmp_nxv8i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: icmp_nxv8i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0, %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir
new file mode 100644
index 0000000..1571daf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-sext.mir
@@ -0,0 +1,1589 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
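+#
+# A G_SEXT from an s1 vector is lowered to a G_SELECT between two splat
+# vectors: all zeros for false lanes and all ones (-1) for true lanes. On
+# RV64 the s32 constants are first any-extended to s64, the XLEN-sized
+# scalar expected by G_SPLAT_VECTOR; on RV32, s64 splat scalars are instead
+# assembled from two s32 halves with G_MERGE_VALUES.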
+---
+name: sext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s8>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s16>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v0
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s8>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s16>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v0
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s8>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s16>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v0
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s8>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s16>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v0
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s8>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s16>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v0
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s8>) = G_SEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v0
+ %0:_(<vscale x 32 x s16>) = G_SEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v0
+ ; RV32-LABEL: name: sext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v0
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v0
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v0
+ %0:_(<vscale x 64 x s8>) = G_SEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s8 element vectors
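+# As the RV32 and RV64 checks below confirm, G_SEXT from s8 element vectors
+# is already legal for every element count covered here, so the legalizer
+# leaves the instruction untouched.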
+---
+name: sext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_SEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_SEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
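+# As with the s8 cases above, G_SEXT from s16 element vectors is already
+# legal; the checks only verify that the instruction survives legalization
+# unchanged.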
+---
+name: sext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_SEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_SEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_SEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_SEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_SEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
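+# Same pattern for s32 sources: the G_SEXT is legal as-is and passes through
+# unchanged on both RV32 and RV64.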
+---
+name: sext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_SEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_SEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_SEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[SEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_SEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir
new file mode 100644
index 0000000..109536a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv32.mir
@@ -0,0 +1,694 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
+
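+# A G_SPLAT_VECTOR of an s1 value has no direct vector encoding, so the
+# legalizer rewrites it: splat-of-true becomes G_VMSET_VL, splat-of-false
+# becomes G_VMCLR_VL, and a non-constant s1 is widened to an s8 splat that
+# is compared (intpred ne) against an all-zeros splat to rebuild the mask.
+# The cases below exercise this for every mask element count.
+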
+---
+name: splatvector_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv1i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 1 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv2i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 2 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 4 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv8i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 8 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv16i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 16 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv32i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv32i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 32 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv64i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv64i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[AND1]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 64 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s32) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s32)
+ %2:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+...
+
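+# Splats of the s8/s16/s32 constants below need no widening on RV32, where
+# XLEN already covers each element type: the constant feeds G_SPLAT_VECTOR
+# directly as an s32 operand.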
+---
+name: splatvector_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+
+---
+name: splatvector_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8m2 = COPY %2(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m4 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv16i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m8 = COPY %2(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir
new file mode 100644
index 0000000..7bf5f83
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-rv64.mir
@@ -0,0 +1,817 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s
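+#
+# Constant i1 splats legalize to G_VMCLR_VL (all zeros) or G_VMSET_VL (all
+# ones); a splat of a non-constant i1 is lowered to an s8 splat compared
+# against a zero splat with G_ICMP ne, as checked below.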
+
+---
+name: splatvector_nxv1i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv1i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv1i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 1 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 1 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 1 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 1 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv2i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv2i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv2i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 2 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 2 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 2 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 2 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv4i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv4i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv4i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 4 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 4 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv8i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv8i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv8i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 8 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 8 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv16i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv16i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv16i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 16 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 16 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 16 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 16 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv32i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv32i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv32i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv32i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 32 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 32 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 32 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 32 x s1>)
+ PseudoRET implicit $v0
+...
+---
+name: splatvector_nxv64i1_0
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_0
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMCLR_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMCLR_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMCLR_VL]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 0
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv64i1_1
+ ; CHECK: [[VMSET_VL:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: [[VMSET_VL1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_VMSET_VL $x0
+ ; CHECK-NEXT: $v0 = COPY [[VMSET_VL1]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s1) = G_CONSTANT i1 1
+ %1:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %0(s1)
+ $v0 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+
+...
+---
+name: splatvector_nxv64i1_2
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: splatvector_nxv64i1_2
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AND]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND1]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 64 x s1>) = G_ICMP intpred(ne), [[SPLAT_VECTOR]](<vscale x 64 x s8>), [[SPLAT_VECTOR1]]
+ ; CHECK-NEXT: $v0 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v0
+ %0:_(s64) = COPY $x10
+ %1:_(s1) = G_TRUNC %0(s64)
+ %2:_(<vscale x 64 x s1>) = G_SPLAT_VECTOR %1(s1)
+ $v0 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v0
+...
+
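+# Splats of s8, s16 and s32 elements: the scalar operand is widened to XLEN
+# (s64) with G_ANYEXT before the G_SPLAT_VECTOR.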
+---
+name: splatvector_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8 = COPY %2(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i8
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %1(s8)
+ $v8m2 = COPY %2(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m2 = COPY %2(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(s16) = G_CONSTANT i16 0
+ %2:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR %1(s16)
+ $v8m4 = COPY %2(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8 = COPY %2(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m2 = COPY %2(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m4 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv16i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv16i32
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(s32) = G_CONSTANT i32 0
+ %2:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR %1(s32)
+ $v8m8 = COPY %2(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
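+
+# Splats of s64 elements: the scalar is already XLEN-wide on RV64, so no
+# widening is needed.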
+---
+name: splatvector_nxv1i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv1i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8 = COPY %2(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv2i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m2 = COPY %2(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv4i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv4i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m4 = COPY %2(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv8i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: splatvector_nxv8i64
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; CHECK-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(s64) = G_CONSTANT i64 0
+ %2:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR %1(s64)
+ $v8m8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir
new file mode 100644
index 0000000..806c9b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-splatvector-s64-rv32.mir
@@ -0,0 +1,116 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=HasF64 %s
+# RUN: llc -mtriple=riscv32 -mattr=+Zve64x -run-pass=legalizer %s -o - | FileCheck --check-prefix=NoF64 %s
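+#
+# An s64 splat scalar on RV32 is built from two s32 halves. With +v (HasF64)
+# the halves are combined with G_MERGE_VALUES and fed to a plain
+# G_SPLAT_VECTOR; with only Zve64x (NoF64) the legalizer instead emits the
+# target-specific G_SPLAT_VECTOR_SPLIT_I64_VL, which takes the two halves
+# directly.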
+
+---
+name: splatvector_nxv1i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv1i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8 = COPY [[SPLAT_VECTOR]](<vscale x 1 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv1i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 1 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: splatvector_nxv2i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv2i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m2 = COPY [[SPLAT_VECTOR]](<vscale x 2 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv2i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m2 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 2 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m2
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: splatvector_nxv4i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv4i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m4 = COPY [[SPLAT_VECTOR]](<vscale x 4 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv4i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m4 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 4 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m4
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: splatvector_nxv8i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.1:
+ ; HasF64-LABEL: name: splatvector_nxv8i64
+ ; HasF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; HasF64-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[DEF]](s32), [[DEF1]](s32)
+ ; HasF64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; HasF64-NEXT: $v8m8 = COPY [[SPLAT_VECTOR]](<vscale x 8 x s64>)
+ ; HasF64-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; NoF64-LABEL: name: splatvector_nxv8i64
+ ; NoF64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; NoF64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; NoF64-NEXT: [[SPLAT_VECTOR_SPLIT_I64_VL:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR_SPLIT_I64_VL [[DEF2]], [[DEF]](s32), [[DEF1]], $x0
+ ; NoF64-NEXT: $v8m8 = COPY [[SPLAT_VECTOR_SPLIT_I64_VL]](<vscale x 8 x s64>)
+ ; NoF64-NEXT: PseudoRET implicit $v8m8
+ %0:_(s64) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR %0(s64)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
index 4de02b1..8a34521 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-xor.mir
@@ -9,8 +9,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s8>) = COPY $v8
%1:_(<vscale x 1 x s8>) = COPY $v9
@@ -27,8 +27,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s8>) = COPY $v8
%1:_(<vscale x 2 x s8>) = COPY $v9
@@ -45,8 +45,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 4 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 4 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s8>) = COPY $v8
%1:_(<vscale x 4 x s8>) = COPY $v9
@@ -63,8 +63,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 8 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 8 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s8>) = COPY $v8
%1:_(<vscale x 8 x s8>) = COPY $v9
@@ -81,8 +81,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 16 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 16 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 16 x s8>) = COPY $v8m2
%1:_(<vscale x 16 x s8>) = COPY $v10m2
@@ -99,8 +99,8 @@ body: |
; CHECK-LABEL: name: test_nxv32i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 32 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 32 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 32 x s8>) = COPY $v8m4
%1:_(<vscale x 32 x s8>) = COPY $v12m4
@@ -117,8 +117,8 @@ body: |
; CHECK-LABEL: name: test_nxv64i8
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 64 x s8>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 64 x s8>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 64 x s8>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 64 x s8>) = COPY $v8m8
%1:_(<vscale x 64 x s8>) = COPY $v16m8
@@ -135,8 +135,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s16>) = COPY $v8
%1:_(<vscale x 1 x s16>) = COPY $v9
@@ -153,8 +153,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s16>) = COPY $v8
%1:_(<vscale x 2 x s16>) = COPY $v9
@@ -171,8 +171,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 4 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s16>) = COPY $v8
%1:_(<vscale x 4 x s16>) = COPY $v9
@@ -189,8 +189,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 8 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 8 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 8 x s16>) = COPY $v8m2
%1:_(<vscale x 8 x s16>) = COPY $v10m2
@@ -207,8 +207,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 16 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 16 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 16 x s16>) = COPY $v8m4
%1:_(<vscale x 16 x s16>) = COPY $v12m4
@@ -225,8 +225,8 @@ body: |
; CHECK-LABEL: name: test_nxv32i16
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 32 x s16>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 32 x s16>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 32 x s16>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 32 x s16>) = COPY $v8m8
%1:_(<vscale x 32 x s16>) = COPY $v16m8
@@ -243,8 +243,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s32>) = COPY $v8
%1:_(<vscale x 1 x s32>) = COPY $v9
@@ -261,8 +261,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 2 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 2 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s32>) = COPY $v8
%1:_(<vscale x 2 x s32>) = COPY $v9
@@ -279,8 +279,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 4 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 4 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 4 x s32>) = COPY $v8m2
%1:_(<vscale x 4 x s32>) = COPY $v10m2
@@ -297,8 +297,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 8 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 8 x s32>) = COPY $v8m4
%1:_(<vscale x 8 x s32>) = COPY $v12m4
@@ -315,8 +315,8 @@ body: |
; CHECK-LABEL: name: test_nxv16i32
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 16 x s32>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 16 x s32>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 16 x s32>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 16 x s32>) = COPY $v8m8
%1:_(<vscale x 16 x s32>) = COPY $v16m8
@@ -333,8 +333,8 @@ body: |
; CHECK-LABEL: name: test_nxv1i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 1 x s64>) = COPY $v9
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8 = COPY [[OR]](<vscale x 1 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8 = COPY [[XOR]](<vscale x 1 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 1 x s64>) = COPY $v8
%1:_(<vscale x 1 x s64>) = COPY $v9
@@ -351,8 +351,8 @@ body: |
; CHECK-LABEL: name: test_nxv2i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v8m2
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 2 x s64>) = COPY $v10m2
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m2 = COPY [[OR]](<vscale x 2 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m2 = COPY [[XOR]](<vscale x 2 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m2
%0:_(<vscale x 2 x s64>) = COPY $v8m2
%1:_(<vscale x 2 x s64>) = COPY $v10m2
@@ -369,8 +369,8 @@ body: |
; CHECK-LABEL: name: test_nxv4i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v8m4
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 4 x s64>) = COPY $v12m4
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m4 = COPY [[OR]](<vscale x 4 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m4 = COPY [[XOR]](<vscale x 4 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m4
%0:_(<vscale x 4 x s64>) = COPY $v8m4
%1:_(<vscale x 4 x s64>) = COPY $v12m4
@@ -387,8 +387,8 @@ body: |
; CHECK-LABEL: name: test_nxv8i64
; CHECK: [[COPY:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v8m8
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<vscale x 8 x s64>) = COPY $v16m8
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_XOR [[COPY]], [[COPY1]]
- ; CHECK-NEXT: $v8m8 = COPY [[OR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_XOR [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: $v8m8 = COPY [[XOR]](<vscale x 8 x s64>)
; CHECK-NEXT: PseudoRET implicit $v8m8
%0:_(<vscale x 8 x s64>) = COPY $v8m8
%1:_(<vscale x 8 x s64>) = COPY $v16m8
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir
new file mode 100644
index 0000000..fe4ddfa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-zext.mir
@@ -0,0 +1,1589 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV32 %s
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck --check-prefix=RV64 %s
+
+# Extend from s1 element vectors
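+#
+# A zero-extend from s1 legalizes to a G_SELECT between splats of 1 and 0; on
+# RV64 the splat scalars are additionally widened to s64 with G_ANYEXT.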
+---
+name: zext_nxv1i8_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i8_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i8_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s8>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i16_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i16_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i16_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i32_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
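+# For s64 results on RV32 the constant splat operands are assembled from two
+# s32 halves with G_MERGE_VALUES.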
+---
+name: zext_nxv1i64_nxv1i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_SELECT [[COPY]](<vscale x 1 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s1>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s1>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i8_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i8_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i8_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s8>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i16_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i16_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i16_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_SELECT [[COPY]](<vscale x 2 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s1>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s1>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i8_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i8_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i8_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s8>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i16_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i16_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i16_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i32_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s1>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s1>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i8_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i8_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i8_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[SELECT]](<vscale x 8 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s8>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8 = COPY %0(<vscale x 8 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv8i16_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i16_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv8i16_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv8i32_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C2]](s32), [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[MV1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SPLAT_VECTOR [[C1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s1>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s1>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i8_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i8_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv16i8_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s8>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m2 = COPY [[SELECT]](<vscale x 16 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s8>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m2 = COPY %0(<vscale x 16 x s8>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv16i16_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i16_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv16i16_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s16>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv16i32_nxv16i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_SELECT [[COPY]](<vscale x 16 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s1>) = COPY $v8
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s1>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv32i8_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv32i8_nxv32i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv32i8_nxv32i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s8>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m4 = COPY [[SELECT]](<vscale x 32 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 32 x s1>) = COPY $v8
+ %0:_(<vscale x 32 x s8>) = G_ZEXT %1(<vscale x 32 x s1>)
+ $v8m4 = COPY %0(<vscale x 32 x s8>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv32i16_nxv32i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv32i16_nxv32i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv32i16_nxv32i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_SELECT [[COPY]](<vscale x 32 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s1>) = COPY $v8
+ %0:_(<vscale x 32 x s16>) = G_ZEXT %1(<vscale x 32 x s1>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv64i8_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv64i8_nxv64i1
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv64i8_nxv64i1
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 64 x s8>) = G_SELECT [[COPY]](<vscale x 64 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8m8 = COPY [[SELECT]](<vscale x 64 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 64 x s1>) = COPY $v8
+ %0:_(<vscale x 64 x s8>) = G_ZEXT %1(<vscale x 64 x s1>)
+ $v8m8 = COPY %0(<vscale x 64 x s8>)
+ PseudoRET implicit $v8m8
+...
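+
+# Note on the i1 cases above: G_ZEXT from an i1 vector is not legal here, so
+# the legalizer lowers it to a G_SELECT between splat vectors of 1 and 0, as
+# the check lines show. The splat immediates are built per target: RV64
+# widens the i32 constants to s64 with G_ANYEXT before G_SPLAT_VECTOR, while
+# for s64 element types RV32 assembles each 64-bit immediate from two i32
+# halves with G_MERGE_VALUES.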
+
+# Extend from s8 element vectors
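+#
+# Unlike the i1 cases above, extends from s8 sources (and from the s16/s32
+# sections below) are already legal for these scalable types, so the G_ZEXT
+# survives legalization unchanged in every check block.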
+---
+name: zext_nxv1i16_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s16>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s8>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s8>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s16>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s8>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s8>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s16>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8 = COPY %0(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s8>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s8>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s16>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m2 = COPY %0(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s8>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s8>) = COPY $v8
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s8>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s16>) = G_ZEXT %1(<vscale x 16 x s8>)
+ $v8m4 = COPY %0(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s8>) = COPY $v8m2
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s8>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 32 x s8>) = COPY $v8m4
+ %0:_(<vscale x 32 x s16>) = G_ZEXT %1(<vscale x 32 x s8>)
+ $v8m8 = COPY %0(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s16 element vectors
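+#
+# These also pass through unchanged; note in the checks how the result moves
+# into wider register groups ($v8m2/$v8m4/$v8m8) as the total vector size
+# doubles with each widening step.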
+---
+name: zext_nxv1i32_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s32>) = G_ZEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s16>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s16>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s32>) = G_ZEXT %1(<vscale x 2 x s16>)
+ $v8 = COPY %0(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s16>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s16>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s32>) = G_ZEXT %1(<vscale x 4 x s16>)
+ $v8m2 = COPY %0(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s16>) = COPY $v8
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s16>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 8 x s16>) = COPY $v8m2
+ %0:_(<vscale x 8 x s32>) = G_ZEXT %1(<vscale x 8 x s16>)
+ $v8m4 = COPY %0(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s16>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s16>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 16 x s16>) = COPY $v8m4
+ %0:_(<vscale x 16 x s32>) = G_ZEXT %1(<vscale x 16 x s16>)
+ $v8m8 = COPY %0(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+...
+
+# Extend from s32 element vectors
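+#
+# Only the single s32 -> s64 widening remains; it stays legal at every LMUL,
+# up to the full <vscale x 8 x s64> result in $v8m8.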
+---
+name: zext_nxv1i64_nxv1i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 1 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %1:_(<vscale x 1 x s32>) = COPY $v8
+ %0:_(<vscale x 1 x s64>) = G_ZEXT %1(<vscale x 1 x s32>)
+ $v8 = COPY %0(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s32>) = COPY $v8
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m2
+ %1:_(<vscale x 2 x s32>) = COPY $v8
+ %0:_(<vscale x 2 x s64>) = G_ZEXT %1(<vscale x 2 x s32>)
+ $v8m2 = COPY %0(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m4
+ %1:_(<vscale x 4 x s32>) = COPY $v8m2
+ %0:_(<vscale x 4 x s64>) = G_ZEXT %1(<vscale x 4 x s32>)
+ $v8m4 = COPY %0(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+ ; RV32-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32: liveins: $v8
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64: liveins: $v8
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8m8
+ %1:_(<vscale x 8 x s32>) = COPY $v8m4
+ %0:_(<vscale x 8 x s64>) = G_ZEXT %1(<vscale x 8 x s32>)
+ $v8m8 = COPY %0(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir
new file mode 100644
index 0000000..062179c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/anyext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
+---
+name: anyext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ANYEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_ANYEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ANYEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ANYEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: anyext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ANYEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[ANYEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ANYEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: anyext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: anyext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ANYEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ANYEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ANYEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: anyext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: anyext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ANYEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ANYEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_ANYEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: anyext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: anyext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ANYEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ANYEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_ANYEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir
new file mode 100644
index 0000000..925d6ae
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/icmp.mir
@@ -0,0 +1,675 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
+---
+name: icmp_nxv1i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s1>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s1>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s1>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s1>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s1>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s1>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i1
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s1>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i1
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s1>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 64 x s1>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s8>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s8>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s8>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s8>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s8>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s8>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv64i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv64i8
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv64i8
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 64 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 64 x s8>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 64 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 64 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 64 x s8>), %0
+ $v8 = COPY %1(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s16>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s16>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s16>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s16>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s16>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv32i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv32i16
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv32i16
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 32 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 32 x s16>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 32 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 32 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 32 x s16>), %0
+ $v8 = COPY %1(<vscale x 32 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s32>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s32>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s32>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s32>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv16i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv16i32
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv16i32
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 16 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 16 x s32>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 16 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 16 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 16 x s32>), %0
+ $v8 = COPY %1(<vscale x 16 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv1i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv1i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv1i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 1 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 1 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 1 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 1 x s64>), %0
+ $v8 = COPY %1(<vscale x 1 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv2i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv2i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv2i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 2 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 2 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 2 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 2 x s64>), %0
+ $v8 = COPY %1(<vscale x 2 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv4i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv4i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv4i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 4 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 4 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 4 x s64>), %0
+ $v8 = COPY %1(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: icmp_nxv8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32I-LABEL: name: icmp_nxv8i64
+ ; RV32I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV32I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: icmp_nxv8i64
+ ; RV64I: [[DEF:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:vrb(<vscale x 8 x s1>) = G_ICMP intpred(sgt), [[DEF]](<vscale x 8 x s64>), [[DEF]]
+ ; RV64I-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 8 x s1>) = G_ICMP intpred(sgt), %0(<vscale x 8 x s64>), %0
+ $v8 = COPY %1(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir
new file mode 100644
index 0000000..a754b8b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/sext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
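+# A brief orientation comment (not in the autogenerated output): these tests
+# cover scalable-vector G_SEXT for each legal (source, result) element-width
+# pair. Regbankselect is expected to assign the vector register bank (vrb) to
+# both the source and the result, with identical assignments on RV32 and RV64.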
+---
+name: sext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_SEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_SEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
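+# Extensions from i16 element sources follow; wider results move into larger
+# register groups ($v8m2/$v8m4/$v8m8) as LMUL grows.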
+---
+name: sext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: sext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_SEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_SEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
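+# Extensions from i32 element sources follow.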
+---
+name: sext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: sext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_SEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[SEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_SEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: sext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: sext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_SEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[SEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_SEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: sext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: sext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_SEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[SEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_SEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: sext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: sext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[SEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_SEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[SEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_SEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir
new file mode 100644
index 0000000..c3bc4a9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/regbankselect/rvv/zext.mir
@@ -0,0 +1,820 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv32 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV32I %s
+# RUN: llc -mtriple=riscv64 -mattr=+m,+v -run-pass=regbankselect \
+# RUN: -disable-gisel-legality-check -simplify-mir -verify-machineinstrs %s \
+# RUN: -o - | FileCheck -check-prefix=RV64I %s
+
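+# A brief orientation comment (not in the autogenerated output): same coverage
+# as sext.mir, but for G_ZEXT. All operands should land in the vector register
+# bank (vrb) regardless of element width or XLEN.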
+---
+name: zext_nxv1i16_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i16_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s16>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s16>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i32_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s8>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s8>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i16_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i16_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s16>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s16>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s8>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s8>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i16_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i16_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s16>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s16>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8 = COPY %1(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv4i32_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s8>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s8>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i16_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i16_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s16>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s16>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m2 = COPY %1(<vscale x 8 x s16>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv8i32_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s8>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s8>) = COPY $v8
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s8>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i16_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i16_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s16>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s16>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m4 = COPY %1(<vscale x 16 x s16>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv16i32_nxv16i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s8>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s8>) = COPY $v8m2
+ %1:_(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s8>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv32i16_nxv32i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv32i16_nxv32i8
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 32 x s8>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 32 x s16>) = G_ZEXT [[COPY]](<vscale x 32 x s8>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 32 x s16>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 32 x s8>) = COPY $v8m4
+ %1:_(<vscale x 32 x s16>) = G_ZEXT %0(<vscale x 32 x s8>)
+ $v8m8 = COPY %1(<vscale x 32 x s16>)
+ PseudoRET implicit $v8m8
+
+...
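+# Extensions from i16 element sources follow; wider results move into larger
+# register groups ($v8m2/$v8m4/$v8m8) as LMUL grows.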
+---
+name: zext_nxv1i32_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i32_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s32>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s32>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv1i64_nxv1i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s16>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s16>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i32_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i32_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s32>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s32>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8 = COPY %1(<vscale x 2 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s16>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s16>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i32_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i32_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s32>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s32>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m2 = COPY %1(<vscale x 4 x s32>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s16>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s16>) = COPY $v8
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s16>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i32_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i32_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s32>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 8 x s16>) = COPY $v8m2
+ %1:_(<vscale x 8 x s32>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m4 = COPY %1(<vscale x 8 x s32>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s16>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s16>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
+---
+name: zext_nxv16i32_nxv16i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv16i32_nxv16i16
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 16 x s16>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 16 x s32>) = G_ZEXT [[COPY]](<vscale x 16 x s16>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 16 x s32>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 16 x s16>) = COPY $v8m4
+ %1:_(<vscale x 16 x s32>) = G_ZEXT %0(<vscale x 16 x s16>)
+ $v8m8 = COPY %1(<vscale x 16 x s32>)
+ PseudoRET implicit $v8m8
+
+...
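+# Extensions from i32 element sources follow.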
+---
+name: zext_nxv1i64_nxv1i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV32I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64I-LABEL: name: zext_nxv1i64_nxv1i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 1 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 1 x s64>) = G_ZEXT [[COPY]](<vscale x 1 x s32>)
+ ; RV64I-NEXT: $v8 = COPY [[ZEXT]](<vscale x 1 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 1 x s32>) = COPY $v8
+ %1:_(<vscale x 1 x s64>) = G_ZEXT %0(<vscale x 1 x s32>)
+ $v8 = COPY %1(<vscale x 1 x s64>)
+ PseudoRET implicit $v8
+
+...
+---
+name: zext_nxv2i64_nxv2i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV32I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m2
+ ;
+ ; RV64I-LABEL: name: zext_nxv2i64_nxv2i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 2 x s32>) = COPY $v8
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 2 x s64>) = G_ZEXT [[COPY]](<vscale x 2 x s32>)
+ ; RV64I-NEXT: $v8m2 = COPY [[ZEXT]](<vscale x 2 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m2
+ %0:_(<vscale x 2 x s32>) = COPY $v8
+ %1:_(<vscale x 2 x s64>) = G_ZEXT %0(<vscale x 2 x s32>)
+ $v8m2 = COPY %1(<vscale x 2 x s64>)
+ PseudoRET implicit $v8m2
+
+...
+---
+name: zext_nxv4i64_nxv4i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV32I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m4
+ ;
+ ; RV64I-LABEL: name: zext_nxv4i64_nxv4i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 4 x s32>) = COPY $v8m2
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 4 x s64>) = G_ZEXT [[COPY]](<vscale x 4 x s32>)
+ ; RV64I-NEXT: $v8m4 = COPY [[ZEXT]](<vscale x 4 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m4
+ %0:_(<vscale x 4 x s32>) = COPY $v8m2
+ %1:_(<vscale x 4 x s64>) = G_ZEXT %0(<vscale x 4 x s32>)
+ $v8m4 = COPY %1(<vscale x 4 x s64>)
+ PseudoRET implicit $v8m4
+
+...
+---
+name: zext_nxv8i64_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $v8
+
+ ; RV32I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV32I: liveins: $v8
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV32I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV32I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV32I-NEXT: PseudoRET implicit $v8m8
+ ;
+ ; RV64I-LABEL: name: zext_nxv8i64_nxv8i32
+ ; RV64I: liveins: $v8
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:vrb(<vscale x 8 x s32>) = COPY $v8m4
+ ; RV64I-NEXT: [[ZEXT:%[0-9]+]]:vrb(<vscale x 8 x s64>) = G_ZEXT [[COPY]](<vscale x 8 x s32>)
+ ; RV64I-NEXT: $v8m8 = COPY [[ZEXT]](<vscale x 8 x s64>)
+ ; RV64I-NEXT: PseudoRET implicit $v8m8
+ %0:_(<vscale x 8 x s32>) = COPY $v8m4
+ %1:_(<vscale x 8 x s64>) = G_ZEXT %0(<vscale x 8 x s32>)
+ $v8m8 = COPY %1(<vscale x 8 x s64>)
+ PseudoRET implicit $v8m8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index bafa92e..65d0768 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -18,14 +18,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vmerge.vim v8, v8, -1, v0
-; RV32-NEXT: vand.vv v8, v11, v8
+; RV32-NEXT: vmerge.vvm v8, v8, v11, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: lui a1, 16
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: slli a0, a0, 16
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv4i32:
@@ -41,14 +39,12 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v11, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v11, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: lui a1, 16
-; RV64-NEXT: addiw a1, a1, -1
-; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: subw a0, a0, a1
+; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: srli a0, a0, 48
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i1 0)
ret i32 %res
@@ -158,8 +154,7 @@ define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV64-NEXT: li a1, -1
; RV64-NEXT: vmadd.vx v16, a1, v8
; RV64-NEXT: vmv.v.i v8, 0
-; RV64-NEXT: vmerge.vim v8, v8, -1, v0
-; RV64-NEXT: vand.vv v8, v16, v8
+; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: subw a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
index f5305a1..83d1d1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll
@@ -19,10 +19,9 @@ define <4 x i64> @vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = sext <4 x i32> %b to <4 x i64>
@@ -41,10 +40,9 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%y = zext <4 x i32> %b to <4 x i64>
@@ -62,9 +60,9 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i64> poison, i64 %b, i32 0
%splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer
@@ -88,10 +86,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -116,10 +112,8 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i32> poison, i32 %b, i32 0
%splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer
@@ -142,12 +136,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -170,12 +161,9 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i16> poison, i16 %b, i32 0
%splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer
@@ -198,12 +186,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -226,12 +211,9 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <4 x i8> poison, i8 %b, i32 0
%splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer
@@ -251,9 +233,9 @@ define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v4i64:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <4 x i32> %a to <4 x i64>
%z = shl <4 x i64> %x, splat (i64 2)
@@ -275,10 +257,9 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = sext <8 x i16> %b to <8 x i32>
@@ -297,10 +278,9 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%y = zext <8 x i16> %b to <8 x i32>
@@ -318,9 +298,9 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i64> poison, i64 %b, i32 0
%splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer
@@ -340,9 +320,9 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i32> poison, i32 %b, i32 0
%splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer
@@ -366,10 +346,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -394,10 +372,8 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i16> poison, i16 %b, i32 0
%splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer
@@ -420,12 +396,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -448,12 +421,9 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <8 x i8> poison, i8 %b, i32 0
%splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer
@@ -473,9 +443,9 @@ define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v8i32:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <8 x i16> %a to <8 x i32>
%z = shl <8 x i32> %x, splat (i32 2)
@@ -497,10 +467,9 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = sext <16 x i8> %b to <16 x i16>
@@ -519,10 +488,9 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%y = zext <16 x i8> %b to <16 x i16>
@@ -552,12 +520,9 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-ZVBB-NEXT: vmv.v.x v12, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i32> poison, i32 %b, i32 0
%splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer
@@ -577,9 +542,9 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) {
;
; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vx v10, v8, a0
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i16> poison, i16 %b, i32 0
%splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer
@@ -603,10 +568,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -631,10 +594,8 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) {
; CHECK-ZVBB: # %bb.0:
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
-; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
-; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: vwsll.vv v10, v8, v9
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%head = insertelement <16 x i8> poison, i8 %b, i32 0
%splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer
@@ -654,9 +615,9 @@ define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) {
;
; CHECK-ZVBB-LABEL: vwsll_vi_v16i16:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8
-; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
+; CHECK-ZVBB-NEXT: vmv2r.v v8, v10
; CHECK-ZVBB-NEXT: ret
%x = zext <16 x i8> %a to <16 x i16>
%z = shl <16 x i16> %x, splat (i16 2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
new file mode 100644
index 0000000..3a8d08f
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binop-into-select.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+; The following binop x, (zext i1) tests will be vector-legalized into a vselect
+; of two splat_vectors, but on RV64 the splat value will be implicitly
+; truncated:
+;
+; t15: nxv2i32 = splat_vector Constant:i64<1>
+; t13: nxv2i32 = splat_vector Constant:i64<0>
+; t16: nxv2i32 = vselect t2, t15, t13
+; t7: nxv2i32 = add t4, t16
+;
+; Make sure that foldSelectWithIdentityConstant in DAGCombiner.cpp handles the
+; truncating splat, so that the vselect is folded into the binop's mask.
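+;
+; As a rough, hand-written illustration (not autogenerated output; the new
+; node number t17 is made up for exposition), the fold rewrites the DAG above
+; into something like:
+;
+;   t17: nxv2i32 = add t4, t15
+;   t16: nxv2i32 = vselect t2, t17, t4
+;
+; i.e. vselect(c, b + 1, b), which is then selected as a single masked
+; vadd.vi, as the first test below checks.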
+
+define <vscale x 2 x i32> @i1_zext_add(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_add_commuted(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_add_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %add = add <vscale x 2 x i32> %zext, %b
+ ret <vscale x 2 x i32> %add
+}
+
+define <vscale x 2 x i32> @i1_zext_sub(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_sub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %sub = sub <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %sub
+}
+
+define <vscale x 2 x i32> @i1_zext_or(<vscale x 2 x i1> %a, <vscale x 2 x i32> %b) {
+; CHECK-LABEL: i1_zext_or:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; CHECK-NEXT: vor.vi v8, v8, 1, v0.t
+; CHECK-NEXT: ret
+ %zext = zext <vscale x 2 x i1> %a to <vscale x 2 x i32>
+ %or = or <vscale x 2 x i32> %b, %zext
+ ret <vscale x 2 x i32> %or
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index e56dca0..a14ce71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -149,49 +149,49 @@ define <vscale x 2 x i64> @vwop_vscale_sext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v8, (a0)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vlm.v v10, (a2)
-; RV32-NEXT: vmv.v.i v11, 0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v12, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vmerge.vim v9, v11, -1, v0
-; RV32-NEXT: vmv.v.v v0, v10
-; RV32-NEXT: vmerge.vim v10, v11, -1, v0
-; RV32-NEXT: vmul.vv v9, v12, v9
-; RV32-NEXT: li a0, 1
-; RV32-NEXT: vsub.vv v11, v12, v10
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vsub.vx v10, v10, a0, v0.t
-; RV32-NEXT: vor.vv v8, v9, v10
-; RV32-NEXT: vor.vv v8, v8, v11
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v8, (a0)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vlm.v v10, (a2)
+; NO_FOLDING-NEXT: vmv.v.i v11, 0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v10
+; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_sext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v8, (a0)
-; RV64-NEXT: vlm.v v9, (a1)
-; RV64-NEXT: vlm.v v10, (a2)
-; RV64-NEXT: vmv.v.i v11, 0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v12, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v11, -1, v0
-; RV64-NEXT: vmv.v.v v0, v10
-; RV64-NEXT: vmerge.vim v10, v11, -1, v0
-; RV64-NEXT: vmul.vv v9, v12, v9
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v11, 1, v0
-; RV64-NEXT: vsub.vv v8, v10, v8
-; RV64-NEXT: vsub.vv v10, v12, v10
-; RV64-NEXT: vor.vv v8, v9, v8
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_sext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v8, (a0)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vlm.v v10, (a2)
+; FOLDING-NEXT: vmv.v.i v11, 0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0
+; FOLDING-NEXT: vmv.v.v v0, v10
+; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
+; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -209,7 +209,7 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v8, (a0)
; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vlm.v v10, (a2)
@@ -221,17 +221,17 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; NO_FOLDING-NEXT: vmv1r.v v0, v10
; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; NO_FOLDING-NEXT: vmul.vv v9, v12, v9
+; NO_FOLDING-NEXT: li a0, 1
+; NO_FOLDING-NEXT: vsub.vv v11, v12, v10
; NO_FOLDING-NEXT: vmv1r.v v0, v8
-; NO_FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; NO_FOLDING-NEXT: vsub.vv v8, v10, v8
-; NO_FOLDING-NEXT: vsub.vv v10, v12, v10
-; NO_FOLDING-NEXT: vor.vv v8, v9, v8
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
+; NO_FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v9, v10
+; NO_FOLDING-NEXT: vor.vv v8, v8, v11
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_sext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v8, (a0)
; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vlm.v v10, (a2)
@@ -243,12 +243,12 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
; FOLDING-NEXT: vmv1r.v v0, v10
; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0
; FOLDING-NEXT: vmul.vv v9, v12, v9
+; FOLDING-NEXT: li a0, 1
+; FOLDING-NEXT: vsub.vv v11, v12, v10
; FOLDING-NEXT: vmv1r.v v0, v8
-; FOLDING-NEXT: vmerge.vim v8, v11, 1, v0
-; FOLDING-NEXT: vsub.vv v8, v10, v8
-; FOLDING-NEXT: vsub.vv v10, v12, v10
-; FOLDING-NEXT: vor.vv v8, v9, v8
-; FOLDING-NEXT: vor.vv v8, v8, v10
+; FOLDING-NEXT: vsub.vx v10, v10, a0, v0.t
+; FOLDING-NEXT: vor.vv v8, v9, v10
+; FOLDING-NEXT: vor.vv v8, v8, v11
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -444,41 +444,39 @@ define <vscale x 2 x i64> @vwop_vscale_zext_i32i64_multiple_users(ptr %x, ptr %y
}
define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ptr %z) {
-; RV32-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, mu
-; RV32-NEXT: vlm.v v0, (a0)
-; RV32-NEXT: vlm.v v8, (a2)
-; RV32-NEXT: vlm.v v9, (a1)
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vmerge.vim v11, v10, 1, v0
-; RV32-NEXT: vmv.v.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v10, 1, v0
-; RV32-NEXT: vadd.vv v10, v11, v8
-; RV32-NEXT: vsub.vv v8, v11, v8
-; RV32-NEXT: vmv.v.v v0, v9
-; RV32-NEXT: vor.vv v10, v10, v11, v0.t
-; RV32-NEXT: vor.vv v8, v10, v8
-; RV32-NEXT: ret
+; NO_FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; NO_FOLDING: # %bb.0:
+; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; NO_FOLDING-NEXT: vlm.v v0, (a0)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
+; NO_FOLDING-NEXT: vmv.v.i v10, 0
+; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; NO_FOLDING-NEXT: vmv.v.v v0, v8
+; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
+; NO_FOLDING-NEXT: vmv.v.v v0, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
+; NO_FOLDING-NEXT: ret
;
-; RV64-LABEL: vwop_vscale_zext_i1i32_multiple_users:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; RV64-NEXT: vlm.v v0, (a0)
-; RV64-NEXT: vlm.v v8, (a1)
-; RV64-NEXT: vlm.v v9, (a2)
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vmerge.vim v11, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v8
-; RV64-NEXT: vmerge.vim v8, v10, 1, v0
-; RV64-NEXT: vmv.v.v v0, v9
-; RV64-NEXT: vmerge.vim v9, v10, 1, v0
-; RV64-NEXT: vmul.vv v8, v11, v8
-; RV64-NEXT: vadd.vv v10, v11, v9
-; RV64-NEXT: vsub.vv v9, v11, v9
-; RV64-NEXT: vor.vv v8, v8, v10
-; RV64-NEXT: vor.vv v8, v8, v9
-; RV64-NEXT: ret
+; FOLDING-LABEL: vwop_vscale_zext_i1i32_multiple_users:
+; FOLDING: # %bb.0:
+; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu
+; FOLDING-NEXT: vlm.v v0, (a0)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
+; FOLDING-NEXT: vmv.v.i v10, 0
+; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
+; FOLDING-NEXT: vmv.v.v v0, v8
+; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
+; FOLDING-NEXT: vmv.v.v v0, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
+; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
%b2 = load <vscale x 2 x i1>, ptr %z
@@ -496,40 +494,36 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y,
define <vscale x 2 x i8> @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, ptr %z) {
; NO_FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; NO_FOLDING: # %bb.0:
-; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; NO_FOLDING-NEXT: vlm.v v0, (a0)
-; NO_FOLDING-NEXT: vlm.v v8, (a1)
-; NO_FOLDING-NEXT: vlm.v v9, (a2)
+; NO_FOLDING-NEXT: vlm.v v8, (a2)
+; NO_FOLDING-NEXT: vlm.v v9, (a1)
; NO_FOLDING-NEXT: vmv.v.i v10, 0
; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; NO_FOLDING-NEXT: vmv1r.v v0, v8
; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; NO_FOLDING-NEXT: vadd.vv v10, v11, v8
+; NO_FOLDING-NEXT: vsub.vv v8, v11, v8
; NO_FOLDING-NEXT: vmv1r.v v0, v9
-; NO_FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; NO_FOLDING-NEXT: vmul.vv v8, v11, v8
-; NO_FOLDING-NEXT: vadd.vv v10, v11, v9
-; NO_FOLDING-NEXT: vsub.vv v9, v11, v9
-; NO_FOLDING-NEXT: vor.vv v8, v8, v10
-; NO_FOLDING-NEXT: vor.vv v8, v8, v9
+; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; NO_FOLDING-NEXT: vor.vv v8, v10, v8
; NO_FOLDING-NEXT: ret
;
; FOLDING-LABEL: vwop_vscale_zext_i1i8_multiple_users:
; FOLDING: # %bb.0:
-; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
+; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu
; FOLDING-NEXT: vlm.v v0, (a0)
-; FOLDING-NEXT: vlm.v v8, (a1)
-; FOLDING-NEXT: vlm.v v9, (a2)
+; FOLDING-NEXT: vlm.v v8, (a2)
+; FOLDING-NEXT: vlm.v v9, (a1)
; FOLDING-NEXT: vmv.v.i v10, 0
; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0
; FOLDING-NEXT: vmv1r.v v0, v8
; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0
+; FOLDING-NEXT: vadd.vv v10, v11, v8
+; FOLDING-NEXT: vsub.vv v8, v11, v8
; FOLDING-NEXT: vmv1r.v v0, v9
-; FOLDING-NEXT: vmerge.vim v9, v10, 1, v0
-; FOLDING-NEXT: vmul.vv v8, v11, v8
-; FOLDING-NEXT: vadd.vv v10, v11, v9
-; FOLDING-NEXT: vsub.vv v9, v11, v9
-; FOLDING-NEXT: vor.vv v8, v8, v10
-; FOLDING-NEXT: vor.vv v8, v8, v9
+; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t
+; FOLDING-NEXT: vor.vv v8, v10, v8
; FOLDING-NEXT: ret
%a = load <vscale x 2 x i1>, ptr %x
%b = load <vscale x 2 x i1>, ptr %y
@@ -594,3 +588,6 @@ define <vscale x 2 x i32> @vwop_vscale_zext_i8i32_multiple_users(ptr %x, ptr %y,
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 0d52dd7..0a5e501 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -825,3 +825,56 @@ define <vscale x 2 x i1> @select_cond_x_cond(<vscale x 2 x i1> %x, <vscale x 2 x
%a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %x, i32 %evl)
ret <vscale x 2 x i1> %a
}
+
+define <vscale x 2 x i1> @select_undef_T_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_undef_T_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_undef_undef_F(<vscale x 2 x i1> %x, i32 zeroext %evl) {
+; CHECK-LABEL: select_undef_undef_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> poison, <vscale x 2 x i1> undef, <vscale x 2 x i1> %x, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_undef_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_undef_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> undef, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_T_undef(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_T_undef:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> poison, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_false_T_F(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %z, i32 zeroext %evl) {
+; CHECK-LABEL: select_false_T_F:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> %y, <vscale x 2 x i1> %z, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
+
+define <vscale x 2 x i1> @select_unknown_T_T(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, i32 zeroext %evl) {
+; CHECK-LABEL: select_unknown_T_T:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i1> @llvm.vp.select.nxv2i1(<vscale x 2 x i1> %x, <vscale x 2 x i1> %y, <vscale x 2 x i1> %y, i32 %evl)
+ ret <vscale x 2 x i1> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
index 770bb56..082de2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll
@@ -627,3 +627,259 @@ define <vscale x 8 x i16> @vwsll_vi_nxv8i16(<vscale x 8 x i8> %a) {
%z = shl <vscale x 8 x i16> %x, splat (i16 2)
ret <vscale x 8 x i16> %z
}
+
+; ==============================================================================
+; i8 -> i64
+; ==============================================================================
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vv_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) {
+; CHECK-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vv_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %b to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i64_nxv2i64_nxv2i8(<vscale x 2 x i8> %a, i64 %b) {
+; CHECK-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vx v8, v10, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i64_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i64> %head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %splat
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i32_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i32 %b) {
+; CHECK-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf2 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i32_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i32> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i16_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i16 %b) {
+; CHECK-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf4 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i16_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf4 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i16> %head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i16> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_sext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_sext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = sext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vx_i8_nxv2i64_nxv2i8_zext(<vscale x 2 x i8> %a, i8 %b) {
+; CHECK-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vzext.vf8 v12, v9
+; CHECK-NEXT: vsll.vv v8, v10, v12
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vx_i8_nxv2i64_nxv2i8_zext:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vmv.v.x v9, a0
+; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9
+; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12
+; CHECK-ZVBB-NEXT: ret
+ %head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %splat = shufflevector <vscale x 2 x i8> %head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %y = zext <vscale x 2 x i8> %splat to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, %y
+ ret <vscale x 2 x i64> %z
+}
+
+define <vscale x 2 x i64> @vwsll_vi_nxv2i64_nxv2i8(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-NEXT: vzext.vf8 v10, v8
+; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vwsll_vi_nxv2i64_nxv2i8:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8
+; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2
+; CHECK-ZVBB-NEXT: ret
+ %x = zext <vscale x 2 x i8> %a to <vscale x 2 x i64>
+ %z = shl <vscale x 2 x i64> %x, splat (i64 2)
+ ret <vscale x 2 x i64> %z
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
index 5bf2adb..07eb67d 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm-bad.ll
@@ -11,3 +11,12 @@ entry:
tail call void asm sideeffect "faddq $0,$1,$2", "{f38},{f0},{f0}"(fp128 0xL0, fp128 0xL0, fp128 0xL0)
ret void
}
+
+; CHECK-LABEL: test_twinword_error
+; CHECK: error: Hi part of pair should point to an even-numbered register
+; CHECK: error: (note that in some cases it might be necessary to manually bind the input/output registers instead of relying on automatic allocation)
+
+define i64 @test_twinword_error(){
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i1}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPARC/inlineasm.ll b/llvm/test/CodeGen/SPARC/inlineasm.ll
index ec27598..9817d7c 100644
--- a/llvm/test/CodeGen/SPARC/inlineasm.ll
+++ b/llvm/test/CodeGen/SPARC/inlineasm.ll
@@ -143,3 +143,12 @@ entry:
%1 = call double asm sideeffect "faddd $1, $2, $0", "=f,f,e"(i64 0, i64 0)
ret void
}
+
+; CHECK-LABEL: test_twinword
+; CHECK: rd %asr5, %i1
+; CHECK: srlx %i1, 32, %i0
+
+define i64 @test_twinword(){
+ %1 = tail call i64 asm sideeffect "rd %asr5, ${0:L} \0A\09 srlx ${0:L}, 32, ${0:H}", "={i0}"()
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
index a4ca3aa..6057bf38 100644
--- a/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
+++ b/llvm/test/CodeGen/SPIRV/OpVariable_order.ll
@@ -1,10 +1,14 @@
-; REQUIRES: spirv-tools
-; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | not spirv-val 2>&1 | FileCheck %s
-; TODO(#66261): The SPIR-V backend should reorder OpVariable instructions so this doesn't fail,
-; but in the meantime it's a good example of the spirv-val tool working as intended.
-; CHECK: All OpVariable instructions in a function must be the first instructions in the first block.
+; All OpVariable instructions in a function must be the first instructions in the first block
+; RUN: llc -O0 -mtriple=spirv-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-linux %s -o - -filetype=obj | spirv-val %}
+; CHECK-SPIRV: OpFunction
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV-NEXT: OpVariable
+; CHECK-SPIRV: OpReturn
+; CHECK-SPIRV: OpFunctionEnd
define void @main() #1 {
entry:
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
index 1071d34..b039f80 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-chain.ll
@@ -10,22 +10,46 @@
; CHECK-SPIRV-DAG: OpName %[[FooObj:.*]] "foo_object"
; CHECK-SPIRV-DAG: OpName %[[FooMemOrder:.*]] "mem_order"
; CHECK-SPIRV-DAG: OpName %[[FooFunc:.*]] "foo"
+
; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 32 0
; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer CrossWorkgroup %[[TyLong]]
; CHECK-SPIRV-DAG: %[[TyFunPtrLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtrLong]]
-; CHECK-SPIRV-DAG: %[[TyGenPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrLong:.*]] = OpTypePointer Generic %[[TyGenPtrLong]]
; CHECK-SPIRV-DAG: %[[TyFunGenPtrLongLong:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrLong]] %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
; CHECK-SPIRV-DAG: %[[Const3:.*]] = OpConstant %[[TyLong]] 3
+
; CHECK-SPIRV: %[[FunTest]] = OpFunction %[[TyVoid]] None %[[TyFunPtrLong]]
; CHECK-SPIRV: %[[ArgCum]] = OpFunctionParameter %[[TyPtrLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[Addr]] %[[Const3]]
+
+; CHECK-SPIRV: %[[HalfAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[HalfAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[HalfAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[HalfAddrCasted]] %[[Const3]]
+
+; CHECK-SPIRV: %[[DblAddr:.*]] = OpPtrCastToGeneric
+; CHECK-SPIRV-NEXT: %[[DblAddrCasted:.*]] = OpBitcast %[[TyGenPtrLong]] %[[DblAddr]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[FooFunc]] %[[DblAddrCasted]] %[[Const3]]
+
; CHECK-SPIRV: %[[FooStub]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[StubObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[MemOrder]] = OpFunctionParameter %[[TyLong]]
+
+; CHECK-SPIRV: %[[ObjectAddr:.*]] = OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: %[[ToGeneric:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[ObjectAddr]]
+; CHECK-SPIRV-NEXT: %[[Casted:.*]] = OpBitcast %[[TyGenPtrPtrLong]] %[[ToGeneric]]
+; CHECK-SPIRV-NEXT: OpStore %[[Casted]] %[[StubObj]]
+
; CHECK-SPIRV: %[[FooFunc]] = OpFunction %[[TyVoid]] None %[[TyFunGenPtrLongLong]]
; CHECK-SPIRV: %[[FooObj]] = OpFunctionParameter %[[TyGenPtrLong]]
; CHECK-SPIRV: %[[FooMemOrder]] = OpFunctionParameter %[[TyLong]]
+
; CHECK-SPIRV: OpFunctionCall %[[TyVoid]] %[[FooStub]] %[[FooObj]] %[[FooMemOrder]]
define spir_kernel void @test(ptr addrspace(1) noundef align 4 %_arg_cum) {
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
new file mode 100644
index 0000000..edb31ff
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-call-no-bitcast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: %[[TyChar:.*]] = OpTypeInt 8 0
+; CHECK-SPIRV-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[TyGenPtrChar:.*]] = OpTypePointer Generic %[[TyChar]]
+; CHECK-SPIRV-DAG: %[[TyFunBar:.*]] = OpTypeFunction %[[TyVoid]] %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyLong:.*]] = OpTypeInt 64 0
+; CHECK-SPIRV-DAG: %[[TyGenPtrPtrChar:.*]] = OpTypePointer Generic %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyFunFoo:.*]] = OpTypeFunction %[[TyVoid]] %[[TyLong]] %[[TyGenPtrPtrChar]] %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyLong]]
+; CHECK-SPIRV-DAG: %[[Const100:.*]] = OpConstant %[[TyLong]] 100
+; CHECK-SPIRV-DAG: %[[TyFunPtrGenPtrChar:.*]] = OpTypePointer Function %[[TyGenPtrChar]]
+; CHECK-SPIRV-DAG: %[[TyPtrStruct:.*]] = OpTypePointer Generic %[[TyStruct]]
+; CHECK-SPIRV-DAG: %[[TyPtrLong:.*]] = OpTypePointer Generic %[[TyLong]]
+
+; CHECK-SPIRV: %[[Bar:.*]] = OpFunction %[[TyVoid]] None %[[TyFunBar]]
+; CHECK-SPIRV: %[[BarArg:.*]] = OpFunctionParameter %[[TyGenPtrChar]]
+; CHECK-SPIRV-NEXT: OpLabel
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV-NEXT: OpVariable %[[TyFunPtrGenPtrChar]] Function
+; CHECK-SPIRV: %[[Var1:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: %[[Var2:.*]] = OpPtrCastToGeneric %[[TyGenPtrPtrChar]] %[[#]]
+; CHECK-SPIRV: OpStore %[[#]] %[[BarArg]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var1]] %[[Var2]]
+; CHECK-SPIRV-NEXT: OpFunctionCall %[[TyVoid]] %[[Foo]] %[[Const100]] %[[Var2]] %[[Var1]]
+
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[TyVoid]] None %[[TyFunFoo]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyLong]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+; CHECK-SPIRV-NEXT: OpFunctionParameter %[[TyGenPtrPtrChar]]
+
+%class.CustomType = type { i64 }
+
+define linkonce_odr dso_local spir_func void @bar(ptr addrspace(4) noundef %first) {
+entry:
+ %first.addr = alloca ptr addrspace(4)
+ %first.addr.ascast = addrspacecast ptr %first.addr to ptr addrspace(4)
+ %temp = alloca ptr addrspace(4), align 8
+ %temp.ascast = addrspacecast ptr %temp to ptr addrspace(4)
+ store ptr addrspace(4) %first, ptr %first.addr
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast)
+ call spir_func void @foo(i64 noundef 100, ptr addrspace(4) noundef dereferenceable(8) %temp.ascast, ptr addrspace(4) noundef dereferenceable(8) %first.addr.ascast)
+ %var = alloca ptr addrspace(4), align 8
+ ret void
+}
+
+define linkonce_odr dso_local spir_func void @foo(i64 noundef %offset, ptr addrspace(4) noundef dereferenceable(8) %in_acc1, ptr addrspace(4) noundef dereferenceable(8) %out_acc1) {
+entry:
+ %r0 = load ptr addrspace(4), ptr addrspace(4) %in_acc1
+ %arrayidx = getelementptr inbounds %class.CustomType, ptr addrspace(4) %r0, i64 42
+ %r1 = load i64, ptr addrspace(4) %arrayidx
+ %r3 = load ptr addrspace(4), ptr addrspace(4) %out_acc1
+ %r4 = getelementptr %class.CustomType, ptr addrspace(4) %r3, i64 43
+ store i64 %r1, ptr addrspace(4) %r4
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/WebAssembly/multi-return.ll b/llvm/test/CodeGen/WebAssembly/multi-return.ll
index 3429cd5..293a1b3 100644
--- a/llvm/test/CodeGen/WebAssembly/multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/multi-return.ll
@@ -78,18 +78,16 @@ define i64 @test4() {
define { i64, i128 } @test5() {
; CHECK-LABEL: test5:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push8=, 8
-; CHECK: i32.add $push9=, $[[SP:[0-9]+]], $pop8
-; CHECK: i32.const $push0=, 16
-; CHECK: i32.add $push1=, $pop9, $pop0
+; CHECK: i32.const $push0=, 24
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
; CHECK: i64.load $push2=, 16($[[SP]])
; CHECK: i64.store 8($0), $pop2
+; CHECK: i64.store 16($0), $[[L1]]
; CHECK: i64.store 0($0), $[[L2]]
-; CHECK: i32.const $push12=, 16
-; CHECK: i32.add $push3=, $0, $pop12
-; CHECK: i64.store 0($pop3), $[[L1]]
+; CHECK: i32.const $push5=, 80
+; CHECK: i32.add $push6=, $3, $pop5
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
@@ -101,20 +99,20 @@ define { i64, i128 } @test5() {
define { i128, i128 } @test6() {
; CHECK-LABEL: test6:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push0=, 64
+; CHECK: i32.const $push0=, 24
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 24
+; CHECK: i32.const $push2=, 64
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
; CHECK: i64.load $[[L3:[0-9]+]]=, 16($[[SP]])
; CHECK: i64.load $push4=, 56($[[SP]])
; CHECK: i64.store 16($0), $pop4
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i64.store 8($0), $[[L2]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L1]]
+; CHECK: i64.store 8($0), $[[L1]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
%r3 = extractvalue { i64, i128, i192, i128, i64 } %t0, 3
@@ -129,19 +127,17 @@ define { i64, i192 } @test7() {
; CHECK: i32.const $push0=, 40
; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
+; CHECK: i64.load $[[L2:[0-9]+]]=, 8($[[SP]])
+; CHECK: i64.load $[[L3:[0-9]+]]=, 32($[[SP]])
; CHECK: i32.const $push2=, 48
; CHECK: i32.add $push3=, $[[SP]], $pop2
-; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i64.load $[[L3:[0-9]+]]=, 8($[[SP]])
-; CHECK: i64.load $push4=, 32($[[SP]])
-; CHECK: i64.store 8($0), $pop4
-; CHECK: i64.store 0($0), $[[L3]]
-; CHECK: i32.const $push5=, 24
-; CHECK: i32.add $push6=, $0, $pop5
-; CHECK: i64.store 0($pop6), $[[L2]]
-; CHECK: i32.const $push7=, 16
-; CHECK: i32.add $push8=, $0, $pop7
-; CHECK: i64.store 0($pop8), $[[L1]]
+; CHECK: i64.load $push4=, 0($pop3)
+; CHECK: i64.store 24($0), $pop4
+; CHECK: i64.store 8($0), $[[L3]]
+; CHECK: i64.store 16($0), $[[L1]]
+; CHECK: i64.store 0($0), $[[L2]]
+; CHECK: i32.const $push7=, 80
+; CHECK: i32.add $push8=, $4, $pop7
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r2 = extractvalue { i64, i128, i192, i128, i64 } %t0, 2
@@ -153,18 +149,16 @@ define { i64, i192 } @test7() {
define { i128, i192, i128, i64 } @test8() {
; CHECK-LABEL: test8:
; CHECK: call return_multi_multi
-; CHECK: i32.const $push18=, 8
-; CHECK: i32.add $push19=, $[[SP:[0-9]+]], $pop18
-; CHECK: i32.const $push0=, 32
-; CHECK: i32.add $push1=, $pop19, $pop0
+; CHECK: i32.const $push0=, 64
+; CHECK: i32.add $push1=, $[[SP:[0-9]+]], $pop0
; CHECK: i64.load $[[L1:[0-9]+]]=, 0($pop1)
-; CHECK: i32.const $push2=, 48
+; CHECK: i32.const $push2=, 40
; CHECK: i32.add $push3=, $[[SP]], $pop2
; CHECK: i64.load $[[L2:[0-9]+]]=, 0($pop3)
-; CHECK: i32.const $push4=, 24
+; CHECK: i32.const $push4=, 48
; CHECK: i32.add $push5=, $[[SP]], $pop4
; CHECK: i64.load $[[L3:[0-9]+]]=, 0($pop5)
-; CHECK: i32.const $push6=, 64
+; CHECK: i32.const $push6=, 24
; CHECK: i32.add $push7=, $[[SP]], $pop6
; CHECK: i64.load $[[L4:[0-9]+]]=, 0($pop7)
; CHECK: i64.load $[[L5:[0-9]+]]=, 8($[[SP]])
@@ -172,19 +166,15 @@ define { i128, i192, i128, i64 } @test8() {
; CHECK: i64.load $[[L7:[0-9]+]]=, 32($[[SP]])
; CHECK: i64.load $push8=, 16($[[SP]])
; CHECK: i64.store 40($0), $pop8
+; CHECK: i64.store 48($0), $[[L4]]
+; CHECK: i64.store 32($0), $[[L3]]
; CHECK: i64.store 16($0), $[[L7]]
+; CHECK: i64.store 24($0), $[[L2]]
; CHECK: i64.store 0($0), $[[L6]]
-; CHECK: i64.store 8($0), $[[L4]]
+; CHECK: i64.store 8($0), $[[L1]]
; CHECK: i64.store 56($0), $[[L5]]
-; CHECK: i32.const $push9=, 48
-; CHECK: i32.add $push10=, $0, $pop9
-; CHECK: i64.store 0($pop10), $[[L3]]
-; CHECK: i32.const $push22=, 32
-; CHECK: i32.add $push11=, $0, $pop22
-; CHECK: i64.store 0($pop11), $[[L2]]
-; CHECK: i32.const $push12=, 24
-; CHECK: i32.add $push13=, $0, $pop12
-; CHECK: i64.store 0($pop13), $[[L1]]
+; CHECK: i32.const $push11=, 80
+; CHECK: i32.add $push12=, $8, $pop11
%t0 = call { i64, i128, i192, i128, i64 } @return_multi_multi()
%r0 = extractvalue { i64, i128, i192, i128, i64 } %t0, 0
%r1 = extractvalue { i64, i128, i192, i128, i64 } %t0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 3a806b9..761a754 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -31,60 +31,38 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: add_v16i8:
; NO-SIMD128: .functype add_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.add $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.add $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.add $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.add $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.add $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.add $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.add $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.add $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.add $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.add $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.add $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.add $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.add $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.add $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.add $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.add $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v16i8:
@@ -96,54 +74,32 @@ define <16 x i8> @add_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.add $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.add $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.add $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.add $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.add $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
ret <16 x i8> %a
@@ -165,60 +121,38 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: sub_v16i8:
; NO-SIMD128: .functype sub_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.sub $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.sub $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.sub $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.sub $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.sub $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.sub $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.sub $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.sub $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.sub $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.sub $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.sub $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.sub $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.sub $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.sub $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.sub $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.sub $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.sub $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.sub $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.sub $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.sub $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v16i8:
@@ -230,54 +164,32 @@ define <16 x i8> @sub_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.sub $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.sub $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.sub $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.sub $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> %x, %y
ret <16 x i8> %a
@@ -425,60 +337,38 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: mul_v16i8:
; NO-SIMD128: .functype mul_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.mul $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.mul $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.mul $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.mul $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.mul $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.mul $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.mul $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.mul $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.mul $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.mul $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.mul $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.mul $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.mul $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.mul $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.mul $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.mul $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.mul $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.mul $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.mul $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.mul $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.mul $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v16i8:
@@ -490,54 +380,32 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.mul $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.mul $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.mul $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.mul $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = mul <16 x i8> %x, %y
ret <16 x i8> %a
@@ -559,108 +427,86 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_s_v16i8:
; NO-SIMD128: .functype min_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.lt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.lt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.lt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.lt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.lt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.lt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.lt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.lt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.lt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.lt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.lt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v16i8:
@@ -681,93 +527,71 @@ define <16 x i8> @min_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.lt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.lt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.lt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
@@ -790,140 +614,118 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: min_u_v16i8:
; NO-SIMD128: .functype min_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.lt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.lt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.lt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.lt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.lt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.lt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.lt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.lt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.lt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.lt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v16i8:
@@ -931,138 +733,116 @@ define <16 x i8> @min_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.lt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.lt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.lt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
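The rewritten NO-SIMD128 blocks above all encode the same mechanical change: each lane's constant byte offset is now folded into the store's addressing mode rather than materialized with an explicit const/add pair. A minimal before/after sketch, using the lane-3 instructions quoted from this hunk:

;   before: build the address, then store at offset 0
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
;
;   after: fold the offset into the store itself
; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16

Eliminating two instructions per non-zero offset is what shrinks each hunk (138 lines down to 116 here) and renumbers the surviving stack slots, e.g. $push117 becomes $push95.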
@@ -1085,108 +865,86 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_s_v16i8:
; NO-SIMD128: .functype max_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $16, $32, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $31
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $15, $31, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 13
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $30
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $31
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $15, $31, $pop6
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop7
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $30
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $14, $30, $pop10
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $29
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $14, $30, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push22=, 12
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $29
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $13, $29, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push28=, 11
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.extend8_s $push25=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push24=, $28
+; NO-SIMD128-NEXT: i32.select $push15=, $13, $29, $pop14
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $28
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $12, $28, $pop18
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop19
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $27
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $11, $27, $pop22
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop23
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push24=, $26
; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
-; NO-SIMD128-NEXT: i32.select $push27=, $12, $28, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push34=, 10
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push30=, $27
-; NO-SIMD128-NEXT: i32.gt_s $push32=, $pop31, $pop30
-; NO-SIMD128-NEXT: i32.select $push33=, $11, $27, $pop32
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push40=, 9
-; NO-SIMD128-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-NEXT: i32.select $push27=, $10, $26, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $25
+; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.select $push31=, $9, $25, $pop30
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop31
+; NO-SIMD128-NEXT: i32.extend8_s $push33=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $24
+; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-NEXT: i32.select $push35=, $8, $24, $pop34
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop35
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push36=, $23
; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $10, $26, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop41), $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $25
-; NO-SIMD128-NEXT: i32.gt_s $push44=, $pop43, $pop42
-; NO-SIMD128-NEXT: i32.select $push45=, $9, $25, $pop44
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-NEXT: i32.const $push50=, 7
-; NO-SIMD128-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push46=, $24
-; NO-SIMD128-NEXT: i32.gt_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.select $push49=, $8, $24, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop51), $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 6
-; NO-SIMD128-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $23
+; NO-SIMD128-NEXT: i32.select $push39=, $7, $23, $pop38
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $22
+; NO-SIMD128-NEXT: i32.gt_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.select $push43=, $6, $22, $pop42
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop43
+; NO-SIMD128-NEXT: i32.extend8_s $push45=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $21
+; NO-SIMD128-NEXT: i32.gt_s $push46=, $pop45, $pop44
+; NO-SIMD128-NEXT: i32.select $push47=, $5, $21, $pop46
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop47
+; NO-SIMD128-NEXT: i32.extend8_s $push49=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push48=, $20
+; NO-SIMD128-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-NEXT: i32.select $push51=, $4, $20, $pop50
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop51
+; NO-SIMD128-NEXT: i32.extend8_s $push53=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push52=, $19
; NO-SIMD128-NEXT: i32.gt_s $push54=, $pop53, $pop52
-; NO-SIMD128-NEXT: i32.select $push55=, $7, $23, $pop54
-; NO-SIMD128-NEXT: i32.store8 0($pop57), $pop55
-; NO-SIMD128-NEXT: i32.const $push62=, 5
-; NO-SIMD128-NEXT: i32.add $push63=, $0, $pop62
-; NO-SIMD128-NEXT: i32.extend8_s $push59=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push58=, $22
-; NO-SIMD128-NEXT: i32.gt_s $push60=, $pop59, $pop58
-; NO-SIMD128-NEXT: i32.select $push61=, $6, $22, $pop60
-; NO-SIMD128-NEXT: i32.store8 0($pop63), $pop61
-; NO-SIMD128-NEXT: i32.extend8_s $push65=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $21
-; NO-SIMD128-NEXT: i32.gt_s $push66=, $pop65, $pop64
-; NO-SIMD128-NEXT: i32.select $push67=, $5, $21, $pop66
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop67
-; NO-SIMD128-NEXT: i32.const $push72=, 3
-; NO-SIMD128-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-NEXT: i32.extend8_s $push69=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push68=, $20
-; NO-SIMD128-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-NEXT: i32.select $push71=, $4, $20, $pop70
-; NO-SIMD128-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-NEXT: i32.extend8_s $push75=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push74=, $19
-; NO-SIMD128-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-NEXT: i32.select $push77=, $3, $19, $pop76
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop77
-; NO-SIMD128-NEXT: i32.extend8_s $push79=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push78=, $18
-; NO-SIMD128-NEXT: i32.gt_s $push80=, $pop79, $pop78
-; NO-SIMD128-NEXT: i32.select $push81=, $2, $18, $pop80
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop81
-; NO-SIMD128-NEXT: i32.extend8_s $push83=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push82=, $17
-; NO-SIMD128-NEXT: i32.gt_s $push84=, $pop83, $pop82
-; NO-SIMD128-NEXT: i32.select $push85=, $1, $17, $pop84
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop85
+; NO-SIMD128-NEXT: i32.select $push55=, $3, $19, $pop54
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop55
+; NO-SIMD128-NEXT: i32.extend8_s $push57=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push56=, $18
+; NO-SIMD128-NEXT: i32.gt_s $push58=, $pop57, $pop56
+; NO-SIMD128-NEXT: i32.select $push59=, $2, $18, $pop58
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop59
+; NO-SIMD128-NEXT: i32.extend8_s $push61=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push60=, $17
+; NO-SIMD128-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-NEXT: i32.select $push63=, $1, $17, $pop62
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop63
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v16i8:
@@ -1207,93 +965,71 @@ define <16 x i8> @max_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $20, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $21
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $21
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $22
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $24
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $23, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $24
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $25
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $24, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $9
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $25
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push34=, $pop33, $pop32
+; NO-SIMD128-FAST-NEXT: i32.select $push35=, $9, $25, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push36=, $26
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push38=, $pop37, $pop36
+; NO-SIMD128-FAST-NEXT: i32.select $push39=, $10, $26, $pop38
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $27
; NO-SIMD128-FAST-NEXT: i32.gt_s $push42=, $pop41, $pop40
-; NO-SIMD128-FAST-NEXT: i32.select $push43=, $9, $25, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $26
+; NO-SIMD128-FAST-NEXT: i32.select $push43=, $11, $27, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop43
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $28
; NO-SIMD128-FAST-NEXT: i32.gt_s $push46=, $pop45, $pop44
-; NO-SIMD128-FAST-NEXT: i32.select $push47=, $10, $26, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push51=, $11
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $27
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push52=, $pop51, $pop50
-; NO-SIMD128-FAST-NEXT: i32.select $push53=, $11, $27, $pop52
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $28
+; NO-SIMD128-FAST-NEXT: i32.select $push47=, $12, $28, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop47
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push48=, $29
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push50=, $pop49, $pop48
+; NO-SIMD128-FAST-NEXT: i32.select $push51=, $13, $29, $pop50
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop51
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push52=, $30
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push54=, $pop53, $pop52
+; NO-SIMD128-FAST-NEXT: i32.select $push55=, $14, $30, $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop55
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push57=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push56=, $31
; NO-SIMD128-FAST-NEXT: i32.gt_s $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.select $push59=, $12, $28, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push63=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push62=, $29
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push64=, $pop63, $pop62
-; NO-SIMD128-FAST-NEXT: i32.select $push65=, $13, $29, $pop64
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $0, $pop72
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push68=, $30
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.select $push71=, $14, $30, $pop70
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop73), $pop71
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $0, $pop78
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push75=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push74=, $31
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push76=, $pop75, $pop74
-; NO-SIMD128-FAST-NEXT: i32.select $push77=, $15, $31, $pop76
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop79), $pop77
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $0, $pop84
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push81=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push80=, $32
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push82=, $pop81, $pop80
-; NO-SIMD128-FAST-NEXT: i32.select $push83=, $16, $32, $pop82
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop85), $pop83
+; NO-SIMD128-FAST-NEXT: i32.select $push59=, $15, $31, $pop58
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop59
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push61=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $32
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push62=, $pop61, $pop60
+; NO-SIMD128-FAST-NEXT: i32.select $push63=, $16, $32, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop63
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
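The signed and unsigned max expansions differ only in how each byte lane is widened before the compare; the select that follows always picks one of the original, unwidened operands. A side-by-side sketch, with instructions quoted from the surrounding hunks:

;   signed (max_s): sign-extend both lanes, then gt_s
; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
;
;   unsigned (max_u): mask both lanes with 255, then gt_u
;   ($pop95 is a second rematerialized 255 constant)
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1

The same offset folding applies to both: every store in the new output carries its lane offset directly, as in i32.store8 15($0).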
@@ -1316,140 +1052,118 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: max_u_v16i8:
; NO-SIMD128: .functype max_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 255
; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
-; NO-SIMD128-NEXT: i32.const $push117=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop117
+; NO-SIMD128-NEXT: i32.const $push95=, 255
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop95
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $16, $32, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push116=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $31, $pop115
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $15, $31, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push114=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $14, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $30, $pop113
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $14, $30, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push112=, 255
-; NO-SIMD128-NEXT: i32.and $push20=, $13, $pop112
-; NO-SIMD128-NEXT: i32.const $push111=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $29, $pop111
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $13, $29, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push110=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $12, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 255
-; NO-SIMD128-NEXT: i32.and $push25=, $28, $pop109
-; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.select $push28=, $12, $28, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.const $push108=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop107
-; NO-SIMD128-NEXT: i32.gt_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.select $push34=, $11, $27, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push106=, 255
-; NO-SIMD128-NEXT: i32.and $push38=, $10, $pop106
-; NO-SIMD128-NEXT: i32.const $push105=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $26, $pop105
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $10, $26, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push104=, 255
-; NO-SIMD128-NEXT: i32.and $push44=, $9, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $25, $pop103
-; NO-SIMD128-NEXT: i32.gt_u $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.select $push46=, $9, $25, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.const $push102=, 255
-; NO-SIMD128-NEXT: i32.and $push48=, $8, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $24, $pop101
-; NO-SIMD128-NEXT: i32.gt_u $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.select $push50=, $8, $24, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push54=, $7, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push53=, $23, $pop99
-; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.select $push56=, $7, $23, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push60=, $6, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push59=, $22, $pop97
-; NO-SIMD128-NEXT: i32.gt_u $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.select $push62=, $6, $22, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $5, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push65=, $21, $pop95
-; NO-SIMD128-NEXT: i32.gt_u $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.select $push68=, $5, $21, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push70=, $4, $pop94
+; NO-SIMD128-NEXT: i32.and $push6=, $15, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push69=, $20, $pop93
-; NO-SIMD128-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.select $push72=, $4, $20, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
+; NO-SIMD128-NEXT: i32.and $push5=, $31, $pop93
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $15, $31, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push76=, $3, $pop92
+; NO-SIMD128-NEXT: i32.and $push10=, $14, $pop92
; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push75=, $19, $pop91
-; NO-SIMD128-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.select $push78=, $3, $19, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
+; NO-SIMD128-NEXT: i32.and $push9=, $30, $pop91
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $14, $30, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push80=, $2, $pop90
+; NO-SIMD128-NEXT: i32.and $push14=, $13, $pop90
; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push79=, $18, $pop89
-; NO-SIMD128-NEXT: i32.gt_u $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.select $push82=, $2, $18, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
+; NO-SIMD128-NEXT: i32.and $push13=, $29, $pop89
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $13, $29, $pop15
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push84=, $1, $pop88
+; NO-SIMD128-NEXT: i32.and $push18=, $12, $pop88
; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push83=, $17, $pop87
-; NO-SIMD128-NEXT: i32.gt_u $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.select $push86=, $1, $17, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.and $push17=, $28, $pop87
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $12, $28, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push86=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $11, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $27, $pop85
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $11, $27, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push84=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $10, $pop84
+; NO-SIMD128-NEXT: i32.const $push83=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $26, $pop83
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $10, $26, $pop27
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push82=, 255
+; NO-SIMD128-NEXT: i32.and $push30=, $9, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $25, $pop81
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $9, $25, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.const $push80=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $8, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 255
+; NO-SIMD128-NEXT: i32.and $push33=, $24, $pop79
+; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.select $push36=, $8, $24, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push78=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $7, $pop78
+; NO-SIMD128-NEXT: i32.const $push77=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $23, $pop77
+; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.select $push40=, $7, $23, $pop39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.const $push76=, 255
+; NO-SIMD128-NEXT: i32.and $push42=, $6, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $22, $pop75
+; NO-SIMD128-NEXT: i32.gt_u $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.select $push44=, $6, $22, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.const $push74=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $5, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 255
+; NO-SIMD128-NEXT: i32.and $push45=, $21, $pop73
+; NO-SIMD128-NEXT: i32.gt_u $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.select $push48=, $5, $21, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.const $push72=, 255
+; NO-SIMD128-NEXT: i32.and $push50=, $4, $pop72
+; NO-SIMD128-NEXT: i32.const $push71=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $20, $pop71
+; NO-SIMD128-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.select $push52=, $4, $20, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push54=, $3, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push53=, $19, $pop69
+; NO-SIMD128-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-NEXT: i32.select $push56=, $3, $19, $pop55
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push58=, $2, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push57=, $18, $pop67
+; NO-SIMD128-NEXT: i32.gt_u $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.select $push60=, $2, $18, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push62=, $1, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push61=, $17, $pop65
+; NO-SIMD128-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.select $push64=, $1, $17, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v16i8:
@@ -1457,138 +1171,116 @@ define <16 x i8> @max_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop117
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop95
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $17, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop115
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop94
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $18, $pop93
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $18, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $19, $pop91
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $19, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop112
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop111
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop90
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $20, $pop89
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $20, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop109
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $21, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop107
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $22, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop106
-; NO-SIMD128-FAST-NEXT: i32.const $push105=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $23, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $21, $pop87
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $21, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $22, $pop85
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $22, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop83
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $24, $pop81
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $23, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $24, $pop103
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $24, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push102=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $9, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $25, $pop101
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $24, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop79
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.select $push36=, $9, $25, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $10, $pop78
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $26, $pop77
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push40=, $10, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop75
; NO-SIMD128-FAST-NEXT: i32.gt_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.select $push44=, $9, $25, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $10, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $26, $pop99
+; NO-SIMD128-FAST-NEXT: i32.select $push44=, $11, $27, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $12, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $28, $pop73
; NO-SIMD128-FAST-NEXT: i32.gt_u $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.select $push48=, $10, $26, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $11, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $27, $pop97
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.select $push54=, $11, $27, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $12, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $28, $pop95
+; NO-SIMD128-FAST-NEXT: i32.select $push48=, $12, $28, $pop47
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push50=, $13, $pop72
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $29, $pop71
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.select $push52=, $13, $29, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push54=, $14, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $30, $pop69
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.select $push56=, $14, $30, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push58=, $15, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $31, $pop67
; NO-SIMD128-FAST-NEXT: i32.gt_u $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.select $push60=, $12, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push64=, $13, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $29, $pop93
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.select $push66=, $13, $29, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push70=, $14, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $30, $pop91
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.select $push72=, $14, $30, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push76=, $15, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $31, $pop89
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.select $push78=, $15, $31, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push82=, $16, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $32, $pop87
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.select $push84=, $16, $32, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.select $push60=, $15, $31, $pop59
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push62=, $16, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $32, $pop65
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.select $push64=, $16, $32, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <16 x i8> %x, %y
%a = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
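With SIMD disabled, the rounding average in the next hunk is expanded per lane as ((x + y + 1) & 254) >> 1: the sum plus one is formed in 32 bits, the 254 mask truncates it to the wrapped 8-bit value (the low bit is discarded by the shift anyway), and the unsigned shift halves it, matching the i8 semantics of the add-then-lshr pattern in the IR. The lane-16 arithmetic, quoted from the old side of the hunk below:

; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
; NO-SIMD128-NEXT: i32.const $push3=, 1
; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push5=, 254
; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
; NO-SIMD128-NEXT: i32.const $push133=, 1
; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133

The new side keeps this arithmetic unchanged and only drops the address computation in front of each store, the same cleanup seen in the hunks above.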
@@ -1611,156 +1303,134 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8:
; NO-SIMD128: .functype avgr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8:
@@ -1771,151 +1441,129 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <16 x i8> %x, %y
%b = add nuw <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
@@ -1949,156 +1597,134 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: avgr_u_v16i8_wrap:
; NO-SIMD128: .functype avgr_u_v16i8_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $16, $32
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 254
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push133=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop133
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push132=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop132
-; NO-SIMD128-NEXT: i32.const $push131=, 254
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop131
-; NO-SIMD128-NEXT: i32.const $push130=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop130
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push129=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop129
-; NO-SIMD128-NEXT: i32.const $push128=, 254
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop128
-; NO-SIMD128-NEXT: i32.const $push127=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop127
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.add $push22=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push126=, 1
-; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop126
-; NO-SIMD128-NEXT: i32.const $push125=, 254
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop125
-; NO-SIMD128-NEXT: i32.const $push124=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop124
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $pop25
-; NO-SIMD128-NEXT: i32.const $push26=, 11
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.add $push28=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push123=, 1
-; NO-SIMD128-NEXT: i32.add $push29=, $pop28, $pop123
-; NO-SIMD128-NEXT: i32.const $push122=, 254
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $pop122
-; NO-SIMD128-NEXT: i32.const $push121=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push31=, $pop30, $pop121
-; NO-SIMD128-NEXT: i32.store8 0($pop27), $pop31
-; NO-SIMD128-NEXT: i32.const $push32=, 10
-; NO-SIMD128-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-NEXT: i32.add $push34=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push120=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop120
-; NO-SIMD128-NEXT: i32.const $push119=, 254
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop119
-; NO-SIMD128-NEXT: i32.const $push118=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop118
-; NO-SIMD128-NEXT: i32.store8 0($pop33), $pop37
-; NO-SIMD128-NEXT: i32.const $push38=, 9
-; NO-SIMD128-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-NEXT: i32.add $push40=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push117=, 1
-; NO-SIMD128-NEXT: i32.add $push41=, $pop40, $pop117
-; NO-SIMD128-NEXT: i32.const $push116=, 254
-; NO-SIMD128-NEXT: i32.and $push42=, $pop41, $pop116
-; NO-SIMD128-NEXT: i32.const $push115=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop115
-; NO-SIMD128-NEXT: i32.store8 0($pop39), $pop43
-; NO-SIMD128-NEXT: i32.add $push44=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push114=, 1
-; NO-SIMD128-NEXT: i32.add $push45=, $pop44, $pop114
-; NO-SIMD128-NEXT: i32.const $push113=, 254
-; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $pop113
-; NO-SIMD128-NEXT: i32.const $push112=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push47=, $pop46, $pop112
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop47
-; NO-SIMD128-NEXT: i32.const $push48=, 7
-; NO-SIMD128-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-NEXT: i32.add $push50=, $8, $24
+; NO-SIMD128-NEXT: i32.add $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 254
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
; NO-SIMD128-NEXT: i32.const $push111=, 1
-; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop111
-; NO-SIMD128-NEXT: i32.const $push110=, 254
-; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop110
-; NO-SIMD128-NEXT: i32.const $push109=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop109
-; NO-SIMD128-NEXT: i32.store8 0($pop49), $pop53
-; NO-SIMD128-NEXT: i32.const $push54=, 6
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.add $push56=, $7, $23
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop111
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push110=, 1
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-NEXT: i32.const $push109=, 254
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop109
; NO-SIMD128-NEXT: i32.const $push108=, 1
-; NO-SIMD128-NEXT: i32.add $push57=, $pop56, $pop108
-; NO-SIMD128-NEXT: i32.const $push107=, 254
-; NO-SIMD128-NEXT: i32.and $push58=, $pop57, $pop107
-; NO-SIMD128-NEXT: i32.const $push106=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push59=, $pop58, $pop106
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop59
-; NO-SIMD128-NEXT: i32.const $push60=, 5
-; NO-SIMD128-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-NEXT: i32.add $push62=, $6, $22
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop108
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push107=, 1
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-NEXT: i32.const $push106=, 254
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-NEXT: i32.const $push105=, 1
-; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop105
-; NO-SIMD128-NEXT: i32.const $push104=, 254
-; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop104
-; NO-SIMD128-NEXT: i32.const $push103=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop103
-; NO-SIMD128-NEXT: i32.store8 0($pop61), $pop65
-; NO-SIMD128-NEXT: i32.add $push66=, $5, $21
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push104=, 1
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-NEXT: i32.const $push103=, 254
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-NEXT: i32.const $push102=, 1
-; NO-SIMD128-NEXT: i32.add $push67=, $pop66, $pop102
-; NO-SIMD128-NEXT: i32.const $push101=, 254
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $pop101
-; NO-SIMD128-NEXT: i32.const $push100=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push69=, $pop68, $pop100
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop69
-; NO-SIMD128-NEXT: i32.const $push70=, 3
-; NO-SIMD128-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-NEXT: i32.add $push72=, $4, $20
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push101=, 1
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-NEXT: i32.const $push100=, 254
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-NEXT: i32.const $push99=, 1
-; NO-SIMD128-NEXT: i32.add $push73=, $pop72, $pop99
-; NO-SIMD128-NEXT: i32.const $push98=, 254
-; NO-SIMD128-NEXT: i32.and $push74=, $pop73, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push75=, $pop74, $pop97
-; NO-SIMD128-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-NEXT: i32.add $push76=, $3, $19
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push98=, 1
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-NEXT: i32.const $push97=, 254
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-NEXT: i32.const $push96=, 1
-; NO-SIMD128-NEXT: i32.add $push77=, $pop76, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 254
-; NO-SIMD128-NEXT: i32.and $push78=, $pop77, $pop95
-; NO-SIMD128-NEXT: i32.const $push94=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push79=, $pop78, $pop94
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop79
-; NO-SIMD128-NEXT: i32.add $push80=, $2, $18
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push95=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-NEXT: i32.const $push94=, 254
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-NEXT: i32.const $push93=, 1
-; NO-SIMD128-NEXT: i32.add $push81=, $pop80, $pop93
-; NO-SIMD128-NEXT: i32.const $push92=, 254
-; NO-SIMD128-NEXT: i32.and $push82=, $pop81, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push83=, $pop82, $pop91
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop83
-; NO-SIMD128-NEXT: i32.add $push84=, $1, $17
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push92=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-NEXT: i32.const $push91=, 254
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-NEXT: i32.const $push90=, 1
-; NO-SIMD128-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 254
-; NO-SIMD128-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-NEXT: i32.const $push88=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop87
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop33
+; NO-SIMD128-NEXT: i32.add $push34=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push89=, 1
+; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-NEXT: i32.const $push88=, 254
+; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-NEXT: i32.const $push87=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop37
+; NO-SIMD128-NEXT: i32.add $push38=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push86=, 1
+; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-NEXT: i32.const $push85=, 254
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-NEXT: i32.const $push84=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push42=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push83=, 1
+; NO-SIMD128-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-NEXT: i32.const $push82=, 254
+; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-NEXT: i32.const $push81=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop45
+; NO-SIMD128-NEXT: i32.add $push46=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push80=, 1
+; NO-SIMD128-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-NEXT: i32.const $push79=, 254
+; NO-SIMD128-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-NEXT: i32.const $push78=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop49
+; NO-SIMD128-NEXT: i32.add $push50=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push77=, 1
+; NO-SIMD128-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-NEXT: i32.const $push76=, 254
+; NO-SIMD128-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-NEXT: i32.const $push75=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop53
+; NO-SIMD128-NEXT: i32.add $push54=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push74=, 1
+; NO-SIMD128-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-NEXT: i32.const $push73=, 254
+; NO-SIMD128-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-NEXT: i32.const $push72=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop57
+; NO-SIMD128-NEXT: i32.add $push58=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push71=, 1
+; NO-SIMD128-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-NEXT: i32.const $push70=, 254
+; NO-SIMD128-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop61
+; NO-SIMD128-NEXT: i32.add $push62=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push68=, 1
+; NO-SIMD128-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 254
+; NO-SIMD128-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-NEXT: i32.const $push66=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop65
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v16i8_wrap:
@@ -2109,151 +1735,129 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 254
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push133=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop133
+; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop111
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push132=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop132
-; NO-SIMD128-FAST-NEXT: i32.const $push131=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop131
-; NO-SIMD128-FAST-NEXT: i32.const $push130=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop130
+; NO-SIMD128-FAST-NEXT: i32.const $push110=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push109=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop109
+; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop108
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push129=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop129
-; NO-SIMD128-FAST-NEXT: i32.const $push128=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop128
-; NO-SIMD128-FAST-NEXT: i32.const $push127=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop127
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push126=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop126
-; NO-SIMD128-FAST-NEXT: i32.const $push125=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop125
-; NO-SIMD128-FAST-NEXT: i32.const $push124=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop124
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push123=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop123
-; NO-SIMD128-FAST-NEXT: i32.const $push122=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop122
-; NO-SIMD128-FAST-NEXT: i32.const $push121=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop121
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push120=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop120
-; NO-SIMD128-FAST-NEXT: i32.const $push119=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop119
-; NO-SIMD128-FAST-NEXT: i32.const $push118=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop118
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push117=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop117
-; NO-SIMD128-FAST-NEXT: i32.const $push116=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop116
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop115
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push114=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop114
-; NO-SIMD128-FAST-NEXT: i32.const $push113=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop113
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop112
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop41
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push111=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop111
-; NO-SIMD128-FAST-NEXT: i32.const $push110=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop110
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop109
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push108=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $pop48, $pop108
-; NO-SIMD128-FAST-NEXT: i32.const $push107=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $pop49, $pop107
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push51=, $pop50, $pop106
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push53=, $0, $pop52
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push107=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop107
+; NO-SIMD128-FAST-NEXT: i32.const $push106=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop106
; NO-SIMD128-FAST-NEXT: i32.const $push105=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop105
-; NO-SIMD128-FAST-NEXT: i32.const $push104=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop104
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop103
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop53), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop105
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.const $push104=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop104
+; NO-SIMD128-FAST-NEXT: i32.const $push103=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop103
; NO-SIMD128-FAST-NEXT: i32.const $push102=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $pop60, $pop102
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $pop61, $pop101
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop100
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop102
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push101=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push100=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop100
; NO-SIMD128-FAST-NEXT: i32.const $push99=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $pop66, $pop99
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push69=, $pop68, $pop97
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop69
-; NO-SIMD128-FAST-NEXT: i32.const $push70=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push71=, $0, $pop70
-; NO-SIMD128-FAST-NEXT: i32.add $push72=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop99
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push98=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop98
+; NO-SIMD128-FAST-NEXT: i32.const $push97=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop97
; NO-SIMD128-FAST-NEXT: i32.const $push96=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push73=, $pop72, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push74=, $pop73, $pop95
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push75=, $pop74, $pop94
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop71), $pop75
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push77=, $0, $pop76
-; NO-SIMD128-FAST-NEXT: i32.add $push78=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push95=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop95
+; NO-SIMD128-FAST-NEXT: i32.const $push94=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop94
; NO-SIMD128-FAST-NEXT: i32.const $push93=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push79=, $pop78, $pop93
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push80=, $pop79, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push81=, $pop80, $pop91
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop77), $pop81
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push83=, $0, $pop82
-; NO-SIMD128-FAST-NEXT: i32.add $push84=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop93
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push92=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop92
+; NO-SIMD128-FAST-NEXT: i32.const $push91=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop91
; NO-SIMD128-FAST-NEXT: i32.const $push90=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push85=, $pop84, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 254
-; NO-SIMD128-FAST-NEXT: i32.and $push86=, $pop85, $pop89
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push87=, $pop86, $pop88
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop83), $pop87
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop90
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.add $push34=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push89=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push35=, $pop34, $pop89
+; NO-SIMD128-FAST-NEXT: i32.const $push88=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push36=, $pop35, $pop88
+; NO-SIMD128-FAST-NEXT: i32.const $push87=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push37=, $pop36, $pop87
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop37
+; NO-SIMD128-FAST-NEXT: i32.add $push38=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push86=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop86
+; NO-SIMD128-FAST-NEXT: i32.const $push85=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push84=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push42=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push83=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push43=, $pop42, $pop83
+; NO-SIMD128-FAST-NEXT: i32.const $push82=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $pop43, $pop82
+; NO-SIMD128-FAST-NEXT: i32.const $push81=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop81
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.add $push46=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push80=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push47=, $pop46, $pop80
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $pop79
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push49=, $pop48, $pop78
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push50=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push51=, $pop50, $pop77
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push52=, $pop51, $pop76
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop75
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop53
+; NO-SIMD128-FAST-NEXT: i32.add $push54=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push55=, $pop54, $pop74
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push56=, $pop55, $pop73
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push57=, $pop56, $pop72
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop57
+; NO-SIMD128-FAST-NEXT: i32.add $push58=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push59=, $pop58, $pop71
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push60=, $pop59, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push61=, $pop60, $pop69
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop61
+; NO-SIMD128-FAST-NEXT: i32.add $push62=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push63=, $pop62, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 254
+; NO-SIMD128-FAST-NEXT: i32.and $push64=, $pop63, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push65=, $pop64, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop65
; NO-SIMD128-FAST-NEXT: return
%a = add <16 x i8> %x, %y
%b = add <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
@@ -2279,140 +1883,118 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-LABEL: abs_v16i8:
; NO-SIMD128: .functype abs_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 15
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend8_s $push0=, $16
; NO-SIMD128-NEXT: i32.const $push1=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop116
+; NO-SIMD128-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop94
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $15
-; NO-SIMD128-NEXT: i32.const $push115=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push114=, $pop6, $pop115
-; NO-SIMD128-NEXT: local.tee $push113=, $16=, $pop114
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop113
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 13
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $14
-; NO-SIMD128-NEXT: i32.const $push112=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push111=, $pop11, $pop112
-; NO-SIMD128-NEXT: local.tee $push110=, $16=, $pop111
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop110
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.const $push19=, 12
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push16=, $13
-; NO-SIMD128-NEXT: i32.const $push109=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push108=, $pop16, $pop109
-; NO-SIMD128-NEXT: local.tee $push107=, $16=, $pop108
-; NO-SIMD128-NEXT: i32.xor $push17=, $13, $pop107
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 11
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push21=, $12
-; NO-SIMD128-NEXT: i32.const $push106=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push105=, $pop21, $pop106
-; NO-SIMD128-NEXT: local.tee $push104=, $16=, $pop105
-; NO-SIMD128-NEXT: i32.xor $push22=, $12, $pop104
-; NO-SIMD128-NEXT: i32.sub $push23=, $pop22, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 10
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push26=, $11
-; NO-SIMD128-NEXT: i32.const $push103=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push102=, $pop26, $pop103
-; NO-SIMD128-NEXT: local.tee $push101=, $16=, $pop102
-; NO-SIMD128-NEXT: i32.xor $push27=, $11, $pop101
-; NO-SIMD128-NEXT: i32.sub $push28=, $pop27, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 9
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $10
-; NO-SIMD128-NEXT: i32.const $push100=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push99=, $pop31, $pop100
-; NO-SIMD128-NEXT: local.tee $push98=, $16=, $pop99
-; NO-SIMD128-NEXT: i32.xor $push32=, $10, $pop98
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push36=, $9
-; NO-SIMD128-NEXT: i32.const $push97=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push96=, $pop36, $pop97
-; NO-SIMD128-NEXT: local.tee $push95=, $16=, $pop96
-; NO-SIMD128-NEXT: i32.xor $push37=, $9, $pop95
-; NO-SIMD128-NEXT: i32.sub $push38=, $pop37, $16
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop38
-; NO-SIMD128-NEXT: i32.const $push94=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop94
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
; NO-SIMD128-NEXT: i32.const $push93=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop39, $pop93
+; NO-SIMD128-NEXT: i32.shr_s $push92=, $pop4, $pop93
; NO-SIMD128-NEXT: local.tee $push91=, $16=, $pop92
-; NO-SIMD128-NEXT: i32.xor $push40=, $8, $pop91
-; NO-SIMD128-NEXT: i32.sub $push41=, $pop40, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop41
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop91
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
; NO-SIMD128-NEXT: i32.const $push90=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop43, $pop90
+; NO-SIMD128-NEXT: i32.shr_s $push89=, $pop7, $pop90
; NO-SIMD128-NEXT: local.tee $push88=, $16=, $pop89
-; NO-SIMD128-NEXT: i32.xor $push44=, $7, $pop88
-; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.extend8_s $push48=, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $14, $pop88
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $16
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
; NO-SIMD128-NEXT: i32.const $push87=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop48, $pop87
+; NO-SIMD128-NEXT: i32.shr_s $push86=, $pop10, $pop87
; NO-SIMD128-NEXT: local.tee $push85=, $16=, $pop86
-; NO-SIMD128-NEXT: i32.xor $push49=, $6, $pop85
-; NO-SIMD128-NEXT: i32.sub $push50=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.extend8_s $push53=, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $13, $pop85
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $16
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
; NO-SIMD128-NEXT: i32.const $push84=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop53, $pop84
+; NO-SIMD128-NEXT: i32.shr_s $push83=, $pop13, $pop84
; NO-SIMD128-NEXT: local.tee $push82=, $16=, $pop83
-; NO-SIMD128-NEXT: i32.xor $push54=, $5, $pop82
-; NO-SIMD128-NEXT: i32.sub $push55=, $pop54, $16
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push56=, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $12, $pop82
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
; NO-SIMD128-NEXT: i32.const $push81=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop56, $pop81
+; NO-SIMD128-NEXT: i32.shr_s $push80=, $pop16, $pop81
; NO-SIMD128-NEXT: local.tee $push79=, $16=, $pop80
-; NO-SIMD128-NEXT: i32.xor $push57=, $4, $pop79
-; NO-SIMD128-NEXT: i32.sub $push58=, $pop57, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.extend8_s $push61=, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $11, $pop79
+; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-NEXT: i32.const $push78=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop61, $pop78
+; NO-SIMD128-NEXT: i32.shr_s $push77=, $pop19, $pop78
; NO-SIMD128-NEXT: local.tee $push76=, $16=, $pop77
-; NO-SIMD128-NEXT: i32.xor $push62=, $3, $pop76
-; NO-SIMD128-NEXT: i32.sub $push63=, $pop62, $16
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.extend8_s $push64=, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $10, $pop76
+; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $16
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
; NO-SIMD128-NEXT: i32.const $push75=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop64, $pop75
+; NO-SIMD128-NEXT: i32.shr_s $push74=, $pop22, $pop75
; NO-SIMD128-NEXT: local.tee $push73=, $16=, $pop74
-; NO-SIMD128-NEXT: i32.xor $push65=, $2, $pop73
-; NO-SIMD128-NEXT: i32.sub $push66=, $pop65, $16
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $9, $pop73
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $16
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $8
; NO-SIMD128-NEXT: i32.const $push72=, 7
-; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop67, $pop72
+; NO-SIMD128-NEXT: i32.shr_s $push71=, $pop25, $pop72
; NO-SIMD128-NEXT: local.tee $push70=, $16=, $pop71
-; NO-SIMD128-NEXT: i32.xor $push68=, $1, $pop70
-; NO-SIMD128-NEXT: i32.sub $push69=, $pop68, $16
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.xor $push26=, $8, $pop70
+; NO-SIMD128-NEXT: i32.sub $push27=, $pop26, $16
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push28=, $7
+; NO-SIMD128-NEXT: i32.const $push69=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-NEXT: local.tee $push67=, $16=, $pop68
+; NO-SIMD128-NEXT: i32.xor $push29=, $7, $pop67
+; NO-SIMD128-NEXT: i32.sub $push30=, $pop29, $16
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $6
+; NO-SIMD128-NEXT: i32.const $push66=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-NEXT: local.tee $push64=, $16=, $pop65
+; NO-SIMD128-NEXT: i32.xor $push32=, $6, $pop64
+; NO-SIMD128-NEXT: i32.sub $push33=, $pop32, $16
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push34=, $5
+; NO-SIMD128-NEXT: i32.const $push63=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-NEXT: local.tee $push61=, $16=, $pop62
+; NO-SIMD128-NEXT: i32.xor $push35=, $5, $pop61
+; NO-SIMD128-NEXT: i32.sub $push36=, $pop35, $16
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push37=, $4
+; NO-SIMD128-NEXT: i32.const $push60=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-NEXT: local.tee $push58=, $16=, $pop59
+; NO-SIMD128-NEXT: i32.xor $push38=, $4, $pop58
+; NO-SIMD128-NEXT: i32.sub $push39=, $pop38, $16
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push40=, $3
+; NO-SIMD128-NEXT: i32.const $push57=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-NEXT: local.tee $push55=, $16=, $pop56
+; NO-SIMD128-NEXT: i32.xor $push41=, $3, $pop55
+; NO-SIMD128-NEXT: i32.sub $push42=, $pop41, $16
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push43=, $2
+; NO-SIMD128-NEXT: i32.const $push54=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-NEXT: local.tee $push52=, $16=, $pop53
+; NO-SIMD128-NEXT: i32.xor $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.sub $push45=, $pop44, $16
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push46=, $1
+; NO-SIMD128-NEXT: i32.const $push51=, 7
+; NO-SIMD128-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-NEXT: local.tee $push49=, $16=, $pop50
+; NO-SIMD128-NEXT: i32.xor $push47=, $1, $pop49
+; NO-SIMD128-NEXT: i32.sub $push48=, $pop47, $16
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v16i8:
@@ -2420,138 +2002,116 @@ define <16 x i8> @abs_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push117=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push116=, $17=, $pop117
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop116
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push94=, $17=, $pop95
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop94
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push115=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push114=, $pop4, $pop115
-; NO-SIMD128-FAST-NEXT: local.tee $push113=, $1=, $pop114
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop113
+; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop4, $pop93
+; NO-SIMD128-FAST-NEXT: local.tee $push91=, $1=, $pop92
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop91
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push112=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push111=, $pop7, $pop112
-; NO-SIMD128-FAST-NEXT: local.tee $push110=, $2=, $pop111
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop110
+; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop7, $pop90
+; NO-SIMD128-FAST-NEXT: local.tee $push88=, $2=, $pop89
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop88
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push109=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push108=, $pop10, $pop109
-; NO-SIMD128-FAST-NEXT: local.tee $push107=, $3=, $pop108
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop107
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push106=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push105=, $pop15, $pop106
-; NO-SIMD128-FAST-NEXT: local.tee $push104=, $4=, $pop105
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop104
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push103=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push102=, $pop18, $pop103
-; NO-SIMD128-FAST-NEXT: local.tee $push101=, $5=, $pop102
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop101
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push99=, $pop23, $pop100
-; NO-SIMD128-FAST-NEXT: local.tee $push98=, $6=, $pop99
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop98
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop97
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push95=, $pop28, $pop96
-; NO-SIMD128-FAST-NEXT: local.tee $push94=, $7=, $pop95
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop94
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop30
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push92=, $pop32, $pop93
-; NO-SIMD128-FAST-NEXT: local.tee $push91=, $8=, $pop92
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop91
-; NO-SIMD128-FAST-NEXT: i32.sub $push34=, $pop33, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push89=, $pop35, $pop90
-; NO-SIMD128-FAST-NEXT: local.tee $push88=, $9=, $pop89
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $10, $pop88
-; NO-SIMD128-FAST-NEXT: i32.sub $push37=, $pop36, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop39), $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $11
; NO-SIMD128-FAST-NEXT: i32.const $push87=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop40, $pop87
-; NO-SIMD128-FAST-NEXT: local.tee $push85=, $10=, $pop86
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop85
-; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push49=, $0, $pop48
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $12
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push86=, $pop10, $pop87
+; NO-SIMD128-FAST-NEXT: local.tee $push85=, $3=, $pop86
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop85
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
; NO-SIMD128-FAST-NEXT: i32.const $push84=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop45, $pop84
-; NO-SIMD128-FAST-NEXT: local.tee $push82=, $11=, $pop83
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop82
-; NO-SIMD128-FAST-NEXT: i32.sub $push47=, $pop46, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop49), $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push50=, $13
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push83=, $pop13, $pop84
+; NO-SIMD128-FAST-NEXT: local.tee $push82=, $4=, $pop83
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop82
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
; NO-SIMD128-FAST-NEXT: i32.const $push81=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop50, $pop81
-; NO-SIMD128-FAST-NEXT: local.tee $push79=, $12=, $pop80
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $13, $pop79
-; NO-SIMD128-FAST-NEXT: i32.sub $push52=, $pop51, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push59=, $0, $pop58
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push55=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push80=, $pop16, $pop81
+; NO-SIMD128-FAST-NEXT: local.tee $push79=, $5=, $pop80
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop79
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop55, $pop78
-; NO-SIMD128-FAST-NEXT: local.tee $push76=, $13=, $pop77
-; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $14, $pop76
-; NO-SIMD128-FAST-NEXT: i32.sub $push57=, $pop56, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop59), $pop57
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push60=, $15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push77=, $pop19, $pop78
+; NO-SIMD128-FAST-NEXT: local.tee $push76=, $6=, $pop77
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop76
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop60, $pop75
-; NO-SIMD128-FAST-NEXT: local.tee $push73=, $14=, $pop74
-; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $15, $pop73
-; NO-SIMD128-FAST-NEXT: i32.sub $push62=, $pop61, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push69=, $0, $pop68
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push65=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push74=, $pop22, $pop75
+; NO-SIMD128-FAST-NEXT: local.tee $push73=, $7=, $pop74
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop65, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $0=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $16, $pop70
-; NO-SIMD128-FAST-NEXT: i32.sub $push67=, $pop66, $0
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop69), $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push71=, $pop25, $pop72
+; NO-SIMD128-FAST-NEXT: local.tee $push70=, $8=, $pop71
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $9, $pop70
+; NO-SIMD128-FAST-NEXT: i32.sub $push27=, $pop26, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push28=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push68=, $pop28, $pop69
+; NO-SIMD128-FAST-NEXT: local.tee $push67=, $9=, $pop68
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $10, $pop67
+; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop31, $pop66
+; NO-SIMD128-FAST-NEXT: local.tee $push64=, $10=, $pop65
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $11, $pop64
+; NO-SIMD128-FAST-NEXT: i32.sub $push33=, $pop32, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push62=, $pop34, $pop63
+; NO-SIMD128-FAST-NEXT: local.tee $push61=, $11=, $pop62
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $12, $pop61
+; NO-SIMD128-FAST-NEXT: i32.sub $push36=, $pop35, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push59=, $pop37, $pop60
+; NO-SIMD128-FAST-NEXT: local.tee $push58=, $12=, $pop59
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $13, $pop58
+; NO-SIMD128-FAST-NEXT: i32.sub $push39=, $pop38, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push40=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push56=, $pop40, $pop57
+; NO-SIMD128-FAST-NEXT: local.tee $push55=, $13=, $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $14, $pop55
+; NO-SIMD128-FAST-NEXT: i32.sub $push42=, $pop41, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push43=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push53=, $pop43, $pop54
+; NO-SIMD128-FAST-NEXT: local.tee $push52=, $14=, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.sub $push45=, $pop44, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push46=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop46, $pop51
+; NO-SIMD128-FAST-NEXT: local.tee $push49=, $15=, $pop50
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $16, $pop49
+; NO-SIMD128-FAST-NEXT: i32.sub $push48=, $pop47, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> zeroinitializer, %x
%b = icmp slt <16 x i8> %x, zeroinitializer
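
For reference, the branch-free absolute-value pattern being checked above, written as scalar IR (a sketch, not part of the test): the arithmetic shift smears the sign bit into a mask s of 0 or -1, and (x ^ s) - s yields x when s is 0 and -x when s is -1, matching the extend8_s / shr_s 7 / xor / sub sequence emitted per lane.

define i8 @abs_i8(i8 %x) {
  %s = ashr i8 %x, 7   ; sign mask: 0 for non-negative x, -1 for negative x
  %t = xor i8 %x, %s   ; bitwise complement only when x is negative
  %r = sub i8 %t, %s   ; subtracting -1 adds the +1 of two's complement
  ret i8 %r
}
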
@@ -2576,75 +2136,53 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype neg_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $9
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop53, $5
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop52, $3
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop51, $2
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop50, $1
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 0
-; NO-SIMD128-NEXT: i32.sub $push6=, $pop49, $16
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 0
-; NO-SIMD128-NEXT: i32.sub $push9=, $pop48, $15
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 0
-; NO-SIMD128-NEXT: i32.sub $push12=, $pop47, $14
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 0
-; NO-SIMD128-NEXT: i32.sub $push15=, $pop46, $13
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 0
-; NO-SIMD128-NEXT: i32.sub $push18=, $pop45, $12
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 0
-; NO-SIMD128-NEXT: i32.sub $push21=, $pop44, $11
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 0
-; NO-SIMD128-NEXT: i32.sub $push24=, $pop43, $10
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 0
-; NO-SIMD128-NEXT: i32.sub $push27=, $pop42, $8
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 0
-; NO-SIMD128-NEXT: i32.sub $push30=, $pop41, $7
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, 0
-; NO-SIMD128-NEXT: i32.sub $push33=, $pop40, $6
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, 0
-; NO-SIMD128-NEXT: i32.sub $push36=, $pop39, $4
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $16
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop31, $15
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop30, $14
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop29, $13
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop28, $12
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop27, $11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop26, $10
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop25, $9
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 0
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop24, $8
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 0
+; NO-SIMD128-NEXT: i32.sub $push10=, $pop23, $7
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 0
+; NO-SIMD128-NEXT: i32.sub $push11=, $pop22, $6
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 0
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop21, $5
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 0
+; NO-SIMD128-NEXT: i32.sub $push13=, $pop20, $4
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 0
+; NO-SIMD128-NEXT: i32.sub $push14=, $pop19, $3
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 0
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop18, $2
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 0
+; NO-SIMD128-NEXT: i32.sub $push16=, $pop17, $1
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v16i8:
@@ -2653,73 +2191,51 @@ define <16 x i8> @neg_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop53, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop31, $2
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop52, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop30, $3
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop51, $4
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop50, $5
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop49, $6
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop48, $7
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop47, $8
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop46, $9
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop45, $10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push23=, $pop44, $11
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push26=, $pop43, $12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push29=, $pop42, $13
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push32=, $pop41, $14
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push35=, $pop40, $15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push38=, $pop39, $16
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop29, $4
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop28, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop27, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop26, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop25, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop23, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push11=, $pop22, $11
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop21, $12
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop20, $13
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push14=, $pop19, $14
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop18, $15
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $16
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = sub <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0,
i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>,
@@ -2744,124 +2260,80 @@ define <16 x i8> @shl_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop39
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.shl $push36=, $4, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop17
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $17
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $17
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $17
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $17
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $17
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $17
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $17
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $17
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v16i8:
; NO-SIMD128-FAST: .functype shl_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $17=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $17=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $17
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $17
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.shl $push17=, $9, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.shl $push23=, $11, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $12, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push29=, $13, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $14, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shl $push35=, $15, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $16, $17
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
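
The variable-shift checks above hinge on one detail: the splatted i8 count arrives in an i32 local, so the backend masks it once with 255 (the i32.and plus local.tee at the top of each version) and reuses the masked count for all sixteen lane shifts. A scalar sketch of one lane follows; names are illustrative, counts of 8 or more are already poison for the original shl <16 x i8>, and wasm's i32.shl masks its count mod 32 at the machine level, so the sketch inherits those assumptions.

define i32 @shl_lane(i32 %lane, i32 %count) {
  %m = and i32 %count, 255  ; recover the zero-extended i8 shift amount
  %r = shl i32 %lane, %m    ; one of the sixteen per-lane i32.shl ops
  ret i32 %r
}

The later i32.store8 keeps only the low byte of each shifted lane, so no explicit truncation is needed.
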
@@ -2890,75 +2362,53 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128: .functype shl_const_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.shl $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, 5
-; NO-SIMD128-NEXT: i32.shl $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, 5
-; NO-SIMD128-NEXT: i32.shl $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, 5
-; NO-SIMD128-NEXT: i32.shl $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, 5
-; NO-SIMD128-NEXT: i32.shl $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, 5
-; NO-SIMD128-NEXT: i32.shl $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, 5
-; NO-SIMD128-NEXT: i32.shl $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, 5
-; NO-SIMD128-NEXT: i32.shl $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, 5
-; NO-SIMD128-NEXT: i32.shl $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push40=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop40
-; NO-SIMD128-NEXT: i32.const $push39=, 5
-; NO-SIMD128-NEXT: i32.shl $push33=, $6, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop33
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.const $push38=, 5
-; NO-SIMD128-NEXT: i32.shl $push35=, $4, $pop38
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.shl $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, 5
+; NO-SIMD128-NEXT: i32.shl $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, 5
+; NO-SIMD128-NEXT: i32.shl $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, 5
+; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, 5
+; NO-SIMD128-NEXT: i32.shl $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, 5
+; NO-SIMD128-NEXT: i32.shl $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, 5
+; NO-SIMD128-NEXT: i32.shl $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, 5
+; NO-SIMD128-NEXT: i32.shl $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, 5
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v16i8:
@@ -2967,73 +2417,51 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $6, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $7, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $8, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $9, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push19=, $10, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push25=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push31=, $14, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $15, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push37=, $16, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v,
<i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5,
@@ -3248,91 +2676,69 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shl_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.shl $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.shl $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.shl $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.shl $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.shl $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.shl $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.shl $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.shl $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.shl $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.shl $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.shl $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.shl $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.shl $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.shl $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.shl $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.shl $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.shl $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.shl $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.shl $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.shl $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.shl $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.shl $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.shl $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.shl $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.shl $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.shl $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.shl $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.shl $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v16i8:
@@ -3342,88 +2748,66 @@ define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.shl $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shl $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shl $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shl $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shl $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shl $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.shl $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.shl $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.shl $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shl $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shl $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shl $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%a = shl <16 x i8> %v, %x
ret <16 x i8> %a
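The change in the hunks above is purely in addressing: the old NO-SIMD128 output materialized each remaining lane address with an i32.const/i32.add pair and stored through it at offset 0, while the new output folds the lane's constant byte offset directly into the i32.store8 offset immediate. That drops two instructions per affected lane and renumbers the surviving $pushN/$popN values downward. For one lane of shl_vec_v16i8 (lane 3, both snippets taken verbatim from the hunk above with the check prefixes omitted), the old sequence

    i32.const $push53=, 3
    i32.add $push54=, $0, $pop53
    i32.const $push55=, 255
    i32.and $push51=, $20, $pop55
    i32.shl $push52=, $4, $pop51
    i32.store8 0($pop54), $pop52

becomes

    i32.const $push36=, 255
    i32.and $push25=, $20, $pop36
    i32.shl $push26=, $4, $pop25
    i32.store8 3($0), $pop26

The per-lane masking of the shift amount with 255 and the shift itself are unchanged; only the store addressing differs.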
@@ -3445,79 +2829,57 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: shr_s_v16i8:
; NO-SIMD128: .functype shr_s_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-NEXT: local.tee $push55=, $17=, $pop56
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop55
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $5
+; NO-SIMD128-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-NEXT: local.tee $push33=, $17=, $pop34
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop33
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $15
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $14
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $13
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $12
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $16
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $11
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.extend8_s $push15=, $15
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $10
+; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $9
; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend8_s $push19=, $14
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $8
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $7
; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $13
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $6
+; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $5
; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $12
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push25=, $4
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.extend8_s $push27=, $3
; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.extend8_s $push31=, $11
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $2
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push31=, $1
; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.extend8_s $push35=, $10
-; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.extend8_s $push39=, $8
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.extend8_s $push43=, $7
-; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.extend8_s $push51=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v16i8:
@@ -3525,9 +2887,9 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $17, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push55=, $1=, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop55
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $17, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push33=, $1=, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -3535,67 +2897,45 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $9
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $10
; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $11
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $12
; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $9
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $13
; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push27=, $14
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push28=, $pop27, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $15
; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push33=, $11
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop33, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop37, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $13
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push45=, $14
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop45, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push53=, $16
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push54=, $pop53, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $16
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -3811,108 +3151,86 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v16i8:
; NO-SIMD128: .functype shr_s_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push2=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push2=, $16
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend8_s $push5=, $5
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop85
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
+; NO-SIMD128-NEXT: i32.extend8_s $push5=, $15
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop63
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
+; NO-SIMD128-NEXT: i32.extend8_s $push8=, $14
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop62
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend8_s $push11=, $2
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop83
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
+; NO-SIMD128-NEXT: i32.extend8_s $push11=, $13
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop61
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.extend8_s $push14=, $1
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop82
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-NEXT: i32.extend8_s $push14=, $12
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop60
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $16
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop81
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.extend8_s $push17=, $11
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop59
; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $15
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop80
-; NO-SIMD128-NEXT: i32.shr_s $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $14
-; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop79
-; NO-SIMD128-NEXT: i32.shr_s $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.extend8_s $push32=, $13
-; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop78
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.extend8_s $push20=, $10
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop58
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.extend8_s $push23=, $9
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop57
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.extend8_s $push26=, $8
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop56
+; NO-SIMD128-NEXT: i32.shr_s $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.extend8_s $push29=, $7
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop55
+; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.extend8_s $push32=, $6
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop54
; NO-SIMD128-NEXT: i32.shr_s $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.extend8_s $push37=, $12
-; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop77
-; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.extend8_s $push42=, $11
-; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop76
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.extend8_s $push47=, $10
-; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop75
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.extend8_s $push35=, $5
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop53
+; NO-SIMD128-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.extend8_s $push38=, $4
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop52
+; NO-SIMD128-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.extend8_s $push41=, $3
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop51
+; NO-SIMD128-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.extend8_s $push44=, $2
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop50
+; NO-SIMD128-NEXT: i32.shr_s $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.extend8_s $push47=, $1
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
; NO-SIMD128-NEXT: i32.shr_s $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
-; NO-SIMD128-NEXT: i32.extend8_s $push52=, $8
-; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop74
-; NO-SIMD128-NEXT: i32.shr_s $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.extend8_s $push57=, $7
-; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop73
-; NO-SIMD128-NEXT: i32.shr_s $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
-; NO-SIMD128-NEXT: i32.extend8_s $push62=, $6
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop72
-; NO-SIMD128-NEXT: i32.shr_s $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
-; NO-SIMD128-NEXT: i32.extend8_s $push67=, $4
-; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_s $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v16i8:
@@ -3924,102 +3242,80 @@ define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop85
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop63
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop84
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop62
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $20, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop82
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $22, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $23, $pop80
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop60
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop58
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push26=, $9
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop56
; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $24, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop29), $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push34=, $9
-; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop78
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push39=, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $26, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop39, $pop38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $27, $pop76
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $10
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push32=, $11
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push35=, $12
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push38=, $13
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop52
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push41=, $14
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push44=, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop50
; NO-SIMD128-FAST-NEXT: i32.shr_s $push45=, $pop44, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push49=, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $28, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push50=, $pop49, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop47), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push54=, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $29, $pop74
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop54, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push57=, $0, $pop56
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push59=, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $30, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push60=, $pop59, $pop58
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop57), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push64=, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $31, $pop72
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop65
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push67=, $0, $pop66
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push69=, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push70=, $pop69, $pop68
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop67), $pop70
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push47=, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = ashr <16 x i8> %v, %x
ret <16 x i8> %a
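The arithmetic-shift hunks apply the same offset folding. In the splat-amount test (shr_s_v16i8) the shift count is masked with 255 once and kept in a local via local.tee, so each lane reduces to i32.extend8_s plus i32.shr_s against that local; in the per-lane test (shr_s_vec_v16i8) every lane masks its own amount before shifting. In both, only the store addressing and the push/pop numbering change. Lane 3 of shr_s_vec_v16i8, extracted verbatim from the hunk above (check prefixes omitted), goes from

    i32.const $push69=, 3
    i32.add $push70=, $0, $pop69
    i32.extend8_s $push67=, $4
    i32.const $push71=, 255
    i32.and $push66=, $20, $pop71
    i32.shr_s $push68=, $pop67, $pop66
    i32.store8 0($pop70), $pop68

to

    i32.extend8_s $push38=, $4
    i32.const $push52=, 255
    i32.and $push37=, $20, $pop52
    i32.shr_s $push39=, $pop38, $pop37
    i32.store8 3($0), $pop39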
@@ -4042,94 +3338,72 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128: .functype shr_u_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-NEXT: local.tee $push70=, $17=, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop70
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, 255
-; NO-SIMD128-NEXT: i32.and $push3=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-NEXT: local.tee $push48=, $17=, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop48
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, 255
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop47
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $17
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, 255
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop46
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $17
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $2, $pop67
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, 255
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop45
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $17
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, 255
-; NO-SIMD128-NEXT: i32.and $push9=, $1, $pop66
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, 255
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop44
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $16, $pop65
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, 255
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop43
; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, 255
-; NO-SIMD128-NEXT: i32.and $push15=, $15, $pop64
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop42
+; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $17
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, 255
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop41
; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $14, $pop63
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $8, $pop40
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop39
; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $13, $pop62
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 255
+; NO-SIMD128-NEXT: i32.and $push21=, $6, $pop38
+; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $17
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $5, $pop37
; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $12, $pop61
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $4, $pop36
+; NO-SIMD128-NEXT: i32.shr_u $push26=, $pop25, $17
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, 255
+; NO-SIMD128-NEXT: i32.and $push27=, $3, $pop35
; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $2, $pop34
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $17
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $1, $pop33
; NO-SIMD128-NEXT: i32.shr_u $push32=, $pop31, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, 255
-; NO-SIMD128-NEXT: i32.and $push35=, $10, $pop59
-; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, 255
-; NO-SIMD128-NEXT: i32.and $push39=, $8, $pop58
-; NO-SIMD128-NEXT: i32.shr_u $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, 255
-; NO-SIMD128-NEXT: i32.and $push43=, $7, $pop57
-; NO-SIMD128-NEXT: i32.shr_u $push44=, $pop43, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $6, $pop56
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $4, $pop55
-; NO-SIMD128-NEXT: i32.shr_u $push52=, $pop51, $17
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v16i8:
@@ -4137,93 +3411,71 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $17, $pop72
-; NO-SIMD128-FAST-NEXT: local.tee $push70=, $1=, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $17, $pop50
+; NO-SIMD128-FAST-NEXT: local.tee $push48=, $1=, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop48
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop47
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop46
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop67
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop45
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop66
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop44
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop43
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop65
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop42
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop64
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop41
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $9, $pop40
; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $11, $pop38
; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop62
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $12, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $13, $pop36
; NO-SIMD128-FAST-NEXT: i32.shr_u $push26=, $pop25, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push27=, $10, $pop61
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $14, $pop35
; NO-SIMD128-FAST-NEXT: i32.shr_u $push28=, $pop27, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop34
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $1
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.shr_u $push32=, $pop31, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop59
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push39=, $13, $pop58
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push40=, $pop39, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $14, $pop57
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push44=, $pop43, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $15, $pop56
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $16, $pop55
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push52=, $pop51, $1
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <16 x i8> undef, i8 %x, i32 0
%s = shufflevector <16 x i8> %t, <16 x i8> undef,
@@ -4440,123 +3692,101 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128: .functype shr_u_vec_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-NEXT: i32.const $push101=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $25, $pop101
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push100=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $5, $pop100
-; NO-SIMD128-NEXT: i32.const $push99=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $21, $pop99
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push98=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-NEXT: i32.const $push97=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push96=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $pop96
-; NO-SIMD128-NEXT: i32.const $push95=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $18, $pop95
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push94=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $1, $pop94
-; NO-SIMD128-NEXT: i32.const $push93=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $17, $pop93
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 15
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push92=, 255
-; NO-SIMD128-NEXT: i32.and $push17=, $16, $pop92
-; NO-SIMD128-NEXT: i32.const $push91=, 255
-; NO-SIMD128-NEXT: i32.and $push16=, $32, $pop91
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push24=, 14
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.const $push90=, 255
-; NO-SIMD128-NEXT: i32.and $push22=, $15, $pop90
-; NO-SIMD128-NEXT: i32.const $push89=, 255
-; NO-SIMD128-NEXT: i32.and $push21=, $31, $pop89
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push29=, 13
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push88=, 255
-; NO-SIMD128-NEXT: i32.and $push27=, $14, $pop88
-; NO-SIMD128-NEXT: i32.const $push87=, 255
-; NO-SIMD128-NEXT: i32.and $push26=, $30, $pop87
-; NO-SIMD128-NEXT: i32.shr_u $push28=, $pop27, $pop26
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push34=, 12
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push86=, 255
-; NO-SIMD128-NEXT: i32.and $push32=, $13, $pop86
-; NO-SIMD128-NEXT: i32.const $push85=, 255
-; NO-SIMD128-NEXT: i32.and $push31=, $29, $pop85
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push39=, 11
-; NO-SIMD128-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-NEXT: i32.const $push84=, 255
-; NO-SIMD128-NEXT: i32.and $push37=, $12, $pop84
-; NO-SIMD128-NEXT: i32.const $push83=, 255
-; NO-SIMD128-NEXT: i32.and $push36=, $28, $pop83
-; NO-SIMD128-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-NEXT: i32.const $push44=, 10
-; NO-SIMD128-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-NEXT: i32.const $push82=, 255
-; NO-SIMD128-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-NEXT: i32.const $push81=, 255
-; NO-SIMD128-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-NEXT: i32.const $push49=, 9
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push80=, 255
-; NO-SIMD128-NEXT: i32.and $push47=, $10, $pop80
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push79=, 255
-; NO-SIMD128-NEXT: i32.and $push46=, $26, $pop79
-; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push54=, 7
-; NO-SIMD128-NEXT: i32.add $push55=, $0, $pop54
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop79
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop3
; NO-SIMD128-NEXT: i32.const $push78=, 255
-; NO-SIMD128-NEXT: i32.and $push52=, $8, $pop78
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop78
; NO-SIMD128-NEXT: i32.const $push77=, 255
-; NO-SIMD128-NEXT: i32.and $push51=, $24, $pop77
-; NO-SIMD128-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-NEXT: i32.const $push59=, 6
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop77
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop6
; NO-SIMD128-NEXT: i32.const $push76=, 255
-; NO-SIMD128-NEXT: i32.and $push57=, $7, $pop76
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop76
; NO-SIMD128-NEXT: i32.const $push75=, 255
-; NO-SIMD128-NEXT: i32.and $push56=, $23, $pop75
-; NO-SIMD128-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.const $push64=, 5
-; NO-SIMD128-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop75
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop9
; NO-SIMD128-NEXT: i32.const $push74=, 255
-; NO-SIMD128-NEXT: i32.and $push62=, $6, $pop74
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop74
; NO-SIMD128-NEXT: i32.const $push73=, 255
-; NO-SIMD128-NEXT: i32.and $push61=, $22, $pop73
-; NO-SIMD128-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-NEXT: i32.const $push69=, 3
-; NO-SIMD128-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop73
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop12
; NO-SIMD128-NEXT: i32.const $push72=, 255
-; NO-SIMD128-NEXT: i32.and $push67=, $4, $pop72
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop72
; NO-SIMD128-NEXT: i32.const $push71=, 255
-; NO-SIMD128-NEXT: i32.and $push66=, $20, $pop71
-; NO-SIMD128-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop71
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push70=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop70
+; NO-SIMD128-NEXT: i32.const $push69=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop69
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push68=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop68
+; NO-SIMD128-NEXT: i32.const $push67=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop67
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push66=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop66
+; NO-SIMD128-NEXT: i32.const $push65=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop65
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push64=, 255
+; NO-SIMD128-NEXT: i32.and $push26=, $8, $pop64
+; NO-SIMD128-NEXT: i32.const $push63=, 255
+; NO-SIMD128-NEXT: i32.and $push25=, $24, $pop63
+; NO-SIMD128-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop27
+; NO-SIMD128-NEXT: i32.const $push62=, 255
+; NO-SIMD128-NEXT: i32.and $push29=, $7, $pop62
+; NO-SIMD128-NEXT: i32.const $push61=, 255
+; NO-SIMD128-NEXT: i32.and $push28=, $23, $pop61
+; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push60=, 255
+; NO-SIMD128-NEXT: i32.and $push32=, $6, $pop60
+; NO-SIMD128-NEXT: i32.const $push59=, 255
+; NO-SIMD128-NEXT: i32.and $push31=, $22, $pop59
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop33
+; NO-SIMD128-NEXT: i32.const $push58=, 255
+; NO-SIMD128-NEXT: i32.and $push35=, $5, $pop58
+; NO-SIMD128-NEXT: i32.const $push57=, 255
+; NO-SIMD128-NEXT: i32.and $push34=, $21, $pop57
+; NO-SIMD128-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop36
+; NO-SIMD128-NEXT: i32.const $push56=, 255
+; NO-SIMD128-NEXT: i32.and $push38=, $4, $pop56
+; NO-SIMD128-NEXT: i32.const $push55=, 255
+; NO-SIMD128-NEXT: i32.and $push37=, $20, $pop55
+; NO-SIMD128-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop39
+; NO-SIMD128-NEXT: i32.const $push54=, 255
+; NO-SIMD128-NEXT: i32.and $push41=, $3, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 255
+; NO-SIMD128-NEXT: i32.and $push40=, $19, $pop53
+; NO-SIMD128-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop42
+; NO-SIMD128-NEXT: i32.const $push52=, 255
+; NO-SIMD128-NEXT: i32.and $push44=, $2, $pop52
+; NO-SIMD128-NEXT: i32.const $push51=, 255
+; NO-SIMD128-NEXT: i32.and $push43=, $18, $pop51
+; NO-SIMD128-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop45
+; NO-SIMD128-NEXT: i32.const $push50=, 255
+; NO-SIMD128-NEXT: i32.and $push47=, $1, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 255
+; NO-SIMD128-NEXT: i32.and $push46=, $17, $pop49
+; NO-SIMD128-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop48
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v16i8:
@@ -4564,122 +3794,100 @@ define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop79
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop100
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop99
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop98
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop97
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop96
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop95
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop94
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop93
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop92
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop91
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop90
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop89
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop88
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop87
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push86=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $9, $pop86
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $25, $pop85
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push84=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $pop84
-; NO-SIMD128-FAST-NEXT: i32.const $push83=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $26, $pop83
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push38=, $pop37, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push45=, $0, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push82=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $11, $pop82
-; NO-SIMD128-FAST-NEXT: i32.const $push81=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $27, $pop81
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop45), $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push80=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $12, $pop80
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $28, $pop79
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push55=, $0, $pop54
; NO-SIMD128-FAST-NEXT: i32.const $push78=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push52=, $13, $pop78
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop78
; NO-SIMD128-FAST-NEXT: i32.const $push77=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $29, $pop77
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop55), $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push60=, $0, $pop59
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop77
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push76=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $14, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop76
; NO-SIMD128-FAST-NEXT: i32.const $push75=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push56=, $30, $pop75
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push58=, $pop57, $pop56
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push65=, $0, $pop64
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop75
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push74=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push62=, $15, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop74
; NO-SIMD128-FAST-NEXT: i32.const $push73=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push61=, $31, $pop73
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push63=, $pop62, $pop61
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop65), $pop63
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push70=, $0, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop73
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop12
; NO-SIMD128-FAST-NEXT: i32.const $push72=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push67=, $16, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop72
; NO-SIMD128-FAST-NEXT: i32.const $push71=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push66=, $32, $pop71
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push68=, $pop67, $pop66
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop70), $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop71
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop70
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop69
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop67
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop66
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop65
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push64=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop64
+; NO-SIMD128-FAST-NEXT: i32.const $push63=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $25, $pop63
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.const $push62=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $10, $pop62
+; NO-SIMD128-FAST-NEXT: i32.const $push61=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $26, $pop61
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push60=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $11, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push59=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $27, $pop59
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop33
+; NO-SIMD128-FAST-NEXT: i32.const $push58=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $12, $pop58
+; NO-SIMD128-FAST-NEXT: i32.const $push57=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $28, $pop57
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push36=, $pop35, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push56=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push38=, $13, $pop56
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $29, $pop55
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $14, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $30, $pop53
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push42=, $pop41, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push44=, $15, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push51=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $31, $pop51
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push45=, $pop44, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $16, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $32, $pop49
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push48=, $pop47, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop48
; NO-SIMD128-FAST-NEXT: return
%a = lshr <16 x i8> %v, %x
ret <16 x i8> %a
@@ -4701,60 +3909,38 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: and_v16i8:
; NO-SIMD128: .functype and_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.and $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.and $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.and $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.and $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.and $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.and $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.and $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.and $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.and $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.and $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.and $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v16i8:
@@ -4766,54 +3952,32 @@ define <16 x i8> @and_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = and <16 x i8> %x, %y
ret <16 x i8> %a
@@ -4835,60 +3999,38 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: or_v16i8:
; NO-SIMD128: .functype or_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.or $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.or $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.or $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.or $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.or $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.or $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.or $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.or $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.or $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.or $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.or $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.or $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.or $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.or $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.or $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.or $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.or $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.or $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.or $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.or $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.or $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v16i8:
@@ -4900,54 +4042,32 @@ define <16 x i8> @or_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.or $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.or $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.or $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.or $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.or $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.or $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.or $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.or $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.or $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.or $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.or $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.or $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.or $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.or $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.or $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = or <16 x i8> %x, %y
ret <16 x i8> %a
@@ -4969,60 +4089,38 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: xor_v16i8:
; NO-SIMD128: .functype xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $9, $25
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $21
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $19
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $18
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop3
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $17
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 15
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.xor $push5=, $16, $32
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 14
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $31
-; NO-SIMD128-NEXT: i32.store8 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 13
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.xor $push11=, $14, $30
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.xor $push14=, $13, $29
-; NO-SIMD128-NEXT: i32.store8 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push18=, 11
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push17=, $12, $28
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 10
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push20=, $11, $27
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push24=, 9
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.xor $push23=, $10, $26
-; NO-SIMD128-NEXT: i32.store8 0($pop25), $pop23
-; NO-SIMD128-NEXT: i32.const $push27=, 7
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push26=, $8, $24
-; NO-SIMD128-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.xor $push29=, $7, $23
-; NO-SIMD128-NEXT: i32.store8 0($pop31), $pop29
-; NO-SIMD128-NEXT: i32.const $push33=, 5
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push32=, $6, $22
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push36=, 3
-; NO-SIMD128-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-NEXT: i32.xor $push35=, $4, $20
-; NO-SIMD128-NEXT: i32.store8 0($pop37), $pop35
+; NO-SIMD128-NEXT: i32.xor $push0=, $16, $32
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $15, $31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $14, $30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $13, $29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $12, $28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $11, $27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $10, $26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $9, $25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop7
+; NO-SIMD128-NEXT: i32.xor $push8=, $8, $24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $7, $23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop9
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop10
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $4, $20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push13=, $3, $19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop13
+; NO-SIMD128-NEXT: i32.xor $push14=, $2, $18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $1, $17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v16i8:
@@ -5034,54 +4132,32 @@ define <16 x i8> @xor_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $19
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop21), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop24), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop27), $pop28
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop30), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop33), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $20
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, %y
ret <16 x i8> %a
@@ -5104,75 +4180,53 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128: .functype not_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $9, $pop0
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push53=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $5, $pop53
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push52=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop52
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push51=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $2, $pop51
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push50=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $1, $pop50
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push7=, 15
-; NO-SIMD128-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-NEXT: i32.const $push49=, -1
-; NO-SIMD128-NEXT: i32.xor $push6=, $16, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop8), $pop6
-; NO-SIMD128-NEXT: i32.const $push10=, 14
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.const $push48=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $15, $pop48
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $14, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 12
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $pop46
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push19=, 11
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop45
-; NO-SIMD128-NEXT: i32.store8 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 10
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop44
-; NO-SIMD128-NEXT: i32.store8 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.const $push25=, 9
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $10, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push28=, 7
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $8, $pop42
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $7, $pop41
-; NO-SIMD128-NEXT: i32.store8 0($pop32), $pop30
-; NO-SIMD128-NEXT: i32.const $push34=, 5
-; NO-SIMD128-NEXT: i32.add $push35=, $0, $pop34
-; NO-SIMD128-NEXT: i32.const $push40=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $6, $pop40
-; NO-SIMD128-NEXT: i32.store8 0($pop35), $pop33
-; NO-SIMD128-NEXT: i32.const $push37=, 3
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push39=, -1
-; NO-SIMD128-NEXT: i32.xor $push36=, $4, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push31=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $15, $pop31
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push30=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $14, $pop30
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push29=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $13, $pop29
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push28=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $12, $pop28
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push27=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $11, $pop27
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push26=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $10, $pop26
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push25=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $9, $pop25
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push24=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $8, $pop24
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $7, $pop23
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop22
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop11
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push12=, $5, $pop21
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $4, $pop20
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop13
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $3, $pop19
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $2, $pop18
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop17
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v16i8:
@@ -5181,73 +4235,51 @@ define <16 x i8> @not_v16i8(<16 x i8> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop31
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop52
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop30
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop50
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop15), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $9, $pop46
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop19), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $12, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop25), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $13, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $14, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop31), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $15, $pop40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $16, $pop39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop37), $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push24=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $9, $pop24
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $10, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $11, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = xor <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5274,91 +4306,69 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128: .functype andnot_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $25, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $9, $pop1
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push69=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $21, $pop69
-; NO-SIMD128-NEXT: i32.and $push4=, $5, $pop3
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push68=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $19, $pop68
-; NO-SIMD128-NEXT: i32.and $push6=, $3, $pop5
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push67=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $18, $pop67
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop7
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push66=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $17, $pop66
-; NO-SIMD128-NEXT: i32.and $push10=, $1, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push13=, 15
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.const $push65=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $32, $pop65
-; NO-SIMD128-NEXT: i32.and $push12=, $16, $pop11
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push17=, 14
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push64=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $31, $pop64
-; NO-SIMD128-NEXT: i32.and $push16=, $15, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push21=, 13
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.const $push63=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $30, $pop63
-; NO-SIMD128-NEXT: i32.and $push20=, $14, $pop19
-; NO-SIMD128-NEXT: i32.store8 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push25=, 12
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.const $push62=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $29, $pop62
-; NO-SIMD128-NEXT: i32.and $push24=, $13, $pop23
-; NO-SIMD128-NEXT: i32.store8 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.const $push61=, -1
-; NO-SIMD128-NEXT: i32.xor $push27=, $28, $pop61
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push33=, 10
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.const $push60=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $27, $pop60
-; NO-SIMD128-NEXT: i32.and $push32=, $11, $pop31
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.const $push37=, 9
-; NO-SIMD128-NEXT: i32.add $push38=, $0, $pop37
-; NO-SIMD128-NEXT: i32.const $push59=, -1
-; NO-SIMD128-NEXT: i32.xor $push35=, $26, $pop59
-; NO-SIMD128-NEXT: i32.and $push36=, $10, $pop35
-; NO-SIMD128-NEXT: i32.store8 0($pop38), $pop36
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.const $push58=, -1
-; NO-SIMD128-NEXT: i32.xor $push39=, $24, $pop58
-; NO-SIMD128-NEXT: i32.and $push40=, $8, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push45=, 6
-; NO-SIMD128-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-NEXT: i32.const $push57=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $pop57
-; NO-SIMD128-NEXT: i32.and $push44=, $7, $pop43
-; NO-SIMD128-NEXT: i32.store8 0($pop46), $pop44
-; NO-SIMD128-NEXT: i32.const $push49=, 5
-; NO-SIMD128-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-NEXT: i32.const $push56=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $22, $pop56
-; NO-SIMD128-NEXT: i32.and $push48=, $6, $pop47
-; NO-SIMD128-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-NEXT: i32.const $push53=, 3
-; NO-SIMD128-NEXT: i32.add $push54=, $0, $pop53
-; NO-SIMD128-NEXT: i32.const $push55=, -1
-; NO-SIMD128-NEXT: i32.xor $push51=, $20, $pop55
-; NO-SIMD128-NEXT: i32.and $push52=, $4, $pop51
-; NO-SIMD128-NEXT: i32.store8 0($pop54), $pop52
+; NO-SIMD128-NEXT: i32.xor $push1=, $32, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop1
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push47=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $pop47
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop3
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push46=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $30, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $14, $pop5
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push45=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $29, $pop45
+; NO-SIMD128-NEXT: i32.and $push8=, $13, $pop7
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push44=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $28, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop9
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push43=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $27, $pop43
+; NO-SIMD128-NEXT: i32.and $push12=, $11, $pop11
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push42=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $26, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $10, $pop13
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push41=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $25, $pop41
+; NO-SIMD128-NEXT: i32.and $push16=, $9, $pop15
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $24, $pop40
+; NO-SIMD128-NEXT: i32.and $push18=, $8, $pop17
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push19=, $23, $pop39
+; NO-SIMD128-NEXT: i32.and $push20=, $7, $pop19
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $22, $pop38
+; NO-SIMD128-NEXT: i32.and $push22=, $6, $pop21
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop22
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push23=, $21, $pop37
+; NO-SIMD128-NEXT: i32.and $push24=, $5, $pop23
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $20, $pop36
+; NO-SIMD128-NEXT: i32.and $push26=, $4, $pop25
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop26
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push27=, $19, $pop35
+; NO-SIMD128-NEXT: i32.and $push28=, $3, $pop27
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $18, $pop34
+; NO-SIMD128-NEXT: i32.and $push30=, $2, $pop29
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop30
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push31=, $17, $pop33
+; NO-SIMD128-NEXT: i32.and $push32=, $1, $pop31
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v16i8:
@@ -5368,88 +4378,66 @@ define <16 x i8> @andnot_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $17, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop69
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $pop47
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop68
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $19, $pop46
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $21, $pop66
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $pop65
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push64=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $23, $pop64
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $24, $pop63
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop22), $pop24
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $25, $pop62
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $26, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $10, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop30
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $27, $pop60
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $11, $pop33
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop32), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $28, $pop59
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop38
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $29, $pop58
-; NO-SIMD128-FAST-NEXT: i32.and $push42=, $13, $pop41
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push44=, $0, $pop43
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $30, $pop57
-; NO-SIMD128-FAST-NEXT: i32.and $push46=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop44), $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push48=, $0, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $31, $pop56
-; NO-SIMD128-FAST-NEXT: i32.and $push50=, $15, $pop49
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop48), $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $32, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push54=, $16, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop52), $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $20, $pop45
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $21, $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $22, $pop43
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $23, $pop42
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $24, $pop41
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $25, $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop17
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $10, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $11, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $12, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $29, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $13, $pop25
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $30, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $31, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $15, $pop29
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $32, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <16 x i8> %y,
<i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -5477,124 +4465,102 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: bitselect_v16i8:
; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $32
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $31
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $47
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $30
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $pop100
-; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $46
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $31
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $30
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $29
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $29
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $45
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.and $push25=, $12, $28
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push26=, $12, $pop98
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $44
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $27
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $26
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42
; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.and $push31=, $11, $27
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push32=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $43
-; NO-SIMD128-NEXT: i32.or $push34=, $pop31, $pop33
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.and $push37=, $10, $26
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $10, $pop96
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $25
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41
+; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push33=, $8, $24
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40
+; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push37=, $7, $23
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71
+; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39
; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.and $push43=, $9, $25
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push44=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $41
-; NO-SIMD128-NEXT: i32.or $push46=, $pop43, $pop45
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.and $push47=, $8, $24
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push48=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $40
-; NO-SIMD128-NEXT: i32.or $push50=, $pop47, $pop49
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.and $push53=, $7, $23
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push54=, $7, $pop93
-; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push41=, $6, $22
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38
+; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.and $push45=, $5, $21
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37
+; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.and $push49=, $4, $20
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36
+; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.and $push53=, $3, $19
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67
+; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35
; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.and $push59=, $6, $22
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push60=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $38
-; NO-SIMD128-NEXT: i32.or $push62=, $pop59, $pop61
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.and $push65=, $5, $21
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push66=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $37
-; NO-SIMD128-NEXT: i32.or $push68=, $pop65, $pop67
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.and $push69=, $4, $20
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push70=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $36
-; NO-SIMD128-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.and $push75=, $3, $19
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push76=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $35
-; NO-SIMD128-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.and $push79=, $2, $18
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push80=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $34
-; NO-SIMD128-NEXT: i32.or $push82=, $pop79, $pop81
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.and $push83=, $1, $17
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push84=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $33
-; NO-SIMD128-NEXT: i32.or $push86=, $pop83, $pop85
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.and $push57=, $2, $18
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34
+; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.and $push61=, $1, $17
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33
+; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
@@ -5607,117 +4573,95 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $5, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $37
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $6, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $38
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $7, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop96
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $8, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $40
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.and $push41=, $9, $25
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $9, $pop94
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41
+; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43
; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $10, $26
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $10, $pop93
-; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69
+; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44
; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.and $push51=, $11, $27
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $43
-; NO-SIMD128-FAST-NEXT: i32.or $push54=, $pop51, $pop53
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.and $push57=, $12, $28
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $12, $pop91
-; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45
+; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46
+; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66
+; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47
; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $13, $29
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $45
-; NO-SIMD128-FAST-NEXT: i32.or $push66=, $pop63, $pop65
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.and $push69=, $14, $30
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $46
-; NO-SIMD128-FAST-NEXT: i32.or $push72=, $pop69, $pop71
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.and $push75=, $15, $31
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $47
-; NO-SIMD128-FAST-NEXT: i32.or $push78=, $pop75, $pop77
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.and $push81=, $16, $32
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $48
-; NO-SIMD128-FAST-NEXT: i32.or $push84=, $pop81, $pop83
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48
+; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <16 x i8> %c, %v1
%inv_mask = xor <16 x i8> %c,
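The diff context window cuts this IR body short; the pattern these per-lane expansions test is the classic and/andnot/or bitselect, which keeps %v1's bits where the mask %c is 1 and %v2's bits where it is 0. A plausible self-contained completion, with %masked_v2 and %sel as assumed names for the elided values:

define <16 x i8> @bitselect_sketch(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
  ; keep v1's bits where the mask is set
  %masked_v1 = and <16 x i8> %c, %v1
  ; invert the mask by xor'ing with all-ones
  %inv_mask = xor <16 x i8> %c, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ; keep v2's bits where the mask is clear, then merge (assumed names)
  %masked_v2 = and <16 x i8> %inv_mask, %v2
  %sel = or <16 x i8> %masked_v1, %masked_v2
  ret <16 x i8> %sel
}

This is exactly the and/xor/and/or quadruple the scalar CHECK lines above emit once per byte lane.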
@@ -5746,92 +4690,70 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-LABEL: bitselect_xor_v16i8:
; NO-SIMD128: .functype bitselect_xor_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 15
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 14
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $31, $47
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $15
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 13
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $14
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 12
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.xor $push15=, $29, $45
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $13
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $pop17
-; NO-SIMD128-NEXT: i32.const $push23=, 11
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $28, $44
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $12
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push28=, 10
-; NO-SIMD128-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-NEXT: i32.xor $push25=, $27, $43
-; NO-SIMD128-NEXT: i32.and $push26=, $pop25, $11
-; NO-SIMD128-NEXT: i32.xor $push27=, $pop26, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop29), $pop27
-; NO-SIMD128-NEXT: i32.const $push33=, 9
-; NO-SIMD128-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-NEXT: i32.xor $push30=, $26, $42
-; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $10
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-NEXT: i32.xor $push35=, $25, $41
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $9
-; NO-SIMD128-NEXT: i32.xor $push37=, $pop36, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop37
-; NO-SIMD128-NEXT: i32.const $push41=, 7
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $24, $40
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $8
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.const $push46=, 6
-; NO-SIMD128-NEXT: i32.add $push47=, $0, $pop46
-; NO-SIMD128-NEXT: i32.xor $push43=, $23, $39
-; NO-SIMD128-NEXT: i32.and $push44=, $pop43, $7
-; NO-SIMD128-NEXT: i32.xor $push45=, $pop44, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop47), $pop45
-; NO-SIMD128-NEXT: i32.const $push51=, 5
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $22, $38
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $6
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.xor $push53=, $21, $37
-; NO-SIMD128-NEXT: i32.and $push54=, $pop53, $5
-; NO-SIMD128-NEXT: i32.xor $push55=, $pop54, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop55
-; NO-SIMD128-NEXT: i32.const $push59=, 3
-; NO-SIMD128-NEXT: i32.add $push60=, $0, $pop59
-; NO-SIMD128-NEXT: i32.xor $push56=, $20, $36
-; NO-SIMD128-NEXT: i32.and $push57=, $pop56, $4
-; NO-SIMD128-NEXT: i32.xor $push58=, $pop57, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop60), $pop58
-; NO-SIMD128-NEXT: i32.xor $push61=, $19, $35
-; NO-SIMD128-NEXT: i32.and $push62=, $pop61, $3
-; NO-SIMD128-NEXT: i32.xor $push63=, $pop62, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop63
-; NO-SIMD128-NEXT: i32.xor $push64=, $18, $34
-; NO-SIMD128-NEXT: i32.and $push65=, $pop64, $2
-; NO-SIMD128-NEXT: i32.xor $push66=, $pop65, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop66
-; NO-SIMD128-NEXT: i32.xor $push67=, $17, $33
-; NO-SIMD128-NEXT: i32.and $push68=, $pop67, $1
-; NO-SIMD128-NEXT: i32.xor $push69=, $pop68, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop69
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop47
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v16i8:
@@ -5849,80 +4771,58 @@ define <16 x i8> @bitselect_xor_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop31
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $pop32, $9
-; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $pop33, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop34
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.and $push38=, $pop37, $10
-; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop36), $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push41=, $0, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $11
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop41), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push46=, $0, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.and $push48=, $pop47, $12
-; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $pop48, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop46), $pop49
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push51=, $0, $pop50
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $13
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop51), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.and $push58=, $pop57, $14
-; NO-SIMD128-FAST-NEXT: i32.xor $push59=, $pop58, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push61=, $0, $pop60
-; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $15
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop61), $pop64
-; NO-SIMD128-FAST-NEXT: i32.const $push65=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push66=, $0, $pop65
-; NO-SIMD128-FAST-NEXT: i32.xor $push67=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.and $push68=, $pop67, $16
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $pop68, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop66), $pop69
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%and = and <16 x i8> %xor1, %c
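The xor form computes ((v1 ^ v2) & c) ^ v2, the same bitwise select with one fewer operation per lane than the and/or form: where a bit of c is 1, the lane reduces to (v1 ^ v2) ^ v2 = v1, and where it is 0, the and clears the bit and 0 ^ v2 = v2. The scalar CHECK lines above lower precisely this xor/and/xor triple for each byte lane, and the reversed variant below applies the same identity with the mask complemented.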
@@ -5949,124 +4849,102 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-LABEL: bitselect_xor_reversed_v16i8:
; NO-SIMD128: .functype bitselect_xor_reversed_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 15
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $32, $48
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $48
-; NO-SIMD128-NEXT: i32.store8 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $31, $47
-; NO-SIMD128-NEXT: i32.const $push101=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $15, $pop101
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $47
-; NO-SIMD128-NEXT: i32.store8 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 13
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $30, $46
-; NO-SIMD128-NEXT: i32.const $push100=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $14, $pop100
+; NO-SIMD128-NEXT: i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $31, $47
+; NO-SIMD128-NEXT: i32.const $push79=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $15, $pop79
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $47
+; NO-SIMD128-NEXT: i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $30, $46
+; NO-SIMD128-NEXT: i32.const $push78=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $14, $pop78
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $46
+; NO-SIMD128-NEXT: i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $29, $45
+; NO-SIMD128-NEXT: i32.const $push77=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $13, $pop77
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $46
-; NO-SIMD128-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push23=, 12
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.xor $push20=, $29, $45
-; NO-SIMD128-NEXT: i32.const $push99=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $13, $pop99
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $45
-; NO-SIMD128-NEXT: i32.store8 0($pop24), $pop22
-; NO-SIMD128-NEXT: i32.const $push29=, 11
-; NO-SIMD128-NEXT: i32.add $push30=, $0, $pop29
-; NO-SIMD128-NEXT: i32.xor $push26=, $28, $44
-; NO-SIMD128-NEXT: i32.const $push98=, -1
-; NO-SIMD128-NEXT: i32.xor $push25=, $12, $pop98
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $45
+; NO-SIMD128-NEXT: i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $28, $44
+; NO-SIMD128-NEXT: i32.const $push76=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $12, $pop76
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $44
+; NO-SIMD128-NEXT: i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $27, $43
+; NO-SIMD128-NEXT: i32.const $push75=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $11, $pop75
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $43
+; NO-SIMD128-NEXT: i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $26, $42
+; NO-SIMD128-NEXT: i32.const $push74=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $10, $pop74
; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $44
-; NO-SIMD128-NEXT: i32.store8 0($pop30), $pop28
-; NO-SIMD128-NEXT: i32.const $push35=, 10
-; NO-SIMD128-NEXT: i32.add $push36=, $0, $pop35
-; NO-SIMD128-NEXT: i32.xor $push32=, $27, $43
-; NO-SIMD128-NEXT: i32.const $push97=, -1
-; NO-SIMD128-NEXT: i32.xor $push31=, $11, $pop97
-; NO-SIMD128-NEXT: i32.and $push33=, $pop32, $pop31
-; NO-SIMD128-NEXT: i32.xor $push34=, $pop33, $43
-; NO-SIMD128-NEXT: i32.store8 0($pop36), $pop34
-; NO-SIMD128-NEXT: i32.const $push41=, 9
-; NO-SIMD128-NEXT: i32.add $push42=, $0, $pop41
-; NO-SIMD128-NEXT: i32.xor $push38=, $26, $42
-; NO-SIMD128-NEXT: i32.const $push96=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $10, $pop96
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $42
+; NO-SIMD128-NEXT: i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $25, $41
+; NO-SIMD128-NEXT: i32.const $push73=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $9, $pop73
+; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $41
+; NO-SIMD128-NEXT: i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT: i32.xor $push34=, $24, $40
+; NO-SIMD128-NEXT: i32.const $push72=, -1
+; NO-SIMD128-NEXT: i32.xor $push33=, $8, $pop72
+; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $40
+; NO-SIMD128-NEXT: i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT: i32.xor $push38=, $23, $39
+; NO-SIMD128-NEXT: i32.const $push71=, -1
+; NO-SIMD128-NEXT: i32.xor $push37=, $7, $pop71
; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $42
-; NO-SIMD128-NEXT: i32.store8 0($pop42), $pop40
-; NO-SIMD128-NEXT: i32.xor $push44=, $25, $41
-; NO-SIMD128-NEXT: i32.const $push95=, -1
-; NO-SIMD128-NEXT: i32.xor $push43=, $9, $pop95
-; NO-SIMD128-NEXT: i32.and $push45=, $pop44, $pop43
-; NO-SIMD128-NEXT: i32.xor $push46=, $pop45, $41
-; NO-SIMD128-NEXT: i32.store8 8($0), $pop46
-; NO-SIMD128-NEXT: i32.const $push51=, 7
-; NO-SIMD128-NEXT: i32.add $push52=, $0, $pop51
-; NO-SIMD128-NEXT: i32.xor $push48=, $24, $40
-; NO-SIMD128-NEXT: i32.const $push94=, -1
-; NO-SIMD128-NEXT: i32.xor $push47=, $8, $pop94
-; NO-SIMD128-NEXT: i32.and $push49=, $pop48, $pop47
-; NO-SIMD128-NEXT: i32.xor $push50=, $pop49, $40
-; NO-SIMD128-NEXT: i32.store8 0($pop52), $pop50
-; NO-SIMD128-NEXT: i32.const $push57=, 6
-; NO-SIMD128-NEXT: i32.add $push58=, $0, $pop57
-; NO-SIMD128-NEXT: i32.xor $push54=, $23, $39
-; NO-SIMD128-NEXT: i32.const $push93=, -1
-; NO-SIMD128-NEXT: i32.xor $push53=, $7, $pop93
+; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $39
+; NO-SIMD128-NEXT: i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push42=, $22, $38
+; NO-SIMD128-NEXT: i32.const $push70=, -1
+; NO-SIMD128-NEXT: i32.xor $push41=, $6, $pop70
+; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $pop41
+; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $38
+; NO-SIMD128-NEXT: i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT: i32.xor $push46=, $21, $37
+; NO-SIMD128-NEXT: i32.const $push69=, -1
+; NO-SIMD128-NEXT: i32.xor $push45=, $5, $pop69
+; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $pop45
+; NO-SIMD128-NEXT: i32.xor $push48=, $pop47, $37
+; NO-SIMD128-NEXT: i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT: i32.xor $push50=, $20, $36
+; NO-SIMD128-NEXT: i32.const $push68=, -1
+; NO-SIMD128-NEXT: i32.xor $push49=, $4, $pop68
+; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-NEXT: i32.xor $push52=, $pop51, $36
+; NO-SIMD128-NEXT: i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT: i32.xor $push54=, $19, $35
+; NO-SIMD128-NEXT: i32.const $push67=, -1
+; NO-SIMD128-NEXT: i32.xor $push53=, $3, $pop67
; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $pop53
-; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $39
-; NO-SIMD128-NEXT: i32.store8 0($pop58), $pop56
-; NO-SIMD128-NEXT: i32.const $push63=, 5
-; NO-SIMD128-NEXT: i32.add $push64=, $0, $pop63
-; NO-SIMD128-NEXT: i32.xor $push60=, $22, $38
-; NO-SIMD128-NEXT: i32.const $push92=, -1
-; NO-SIMD128-NEXT: i32.xor $push59=, $6, $pop92
-; NO-SIMD128-NEXT: i32.and $push61=, $pop60, $pop59
-; NO-SIMD128-NEXT: i32.xor $push62=, $pop61, $38
-; NO-SIMD128-NEXT: i32.store8 0($pop64), $pop62
-; NO-SIMD128-NEXT: i32.xor $push66=, $21, $37
-; NO-SIMD128-NEXT: i32.const $push91=, -1
-; NO-SIMD128-NEXT: i32.xor $push65=, $5, $pop91
-; NO-SIMD128-NEXT: i32.and $push67=, $pop66, $pop65
-; NO-SIMD128-NEXT: i32.xor $push68=, $pop67, $37
-; NO-SIMD128-NEXT: i32.store8 4($0), $pop68
-; NO-SIMD128-NEXT: i32.const $push73=, 3
-; NO-SIMD128-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-NEXT: i32.xor $push70=, $20, $36
-; NO-SIMD128-NEXT: i32.const $push90=, -1
-; NO-SIMD128-NEXT: i32.xor $push69=, $4, $pop90
-; NO-SIMD128-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-NEXT: i32.xor $push72=, $pop71, $36
-; NO-SIMD128-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-NEXT: i32.xor $push76=, $19, $35
-; NO-SIMD128-NEXT: i32.const $push89=, -1
-; NO-SIMD128-NEXT: i32.xor $push75=, $3, $pop89
-; NO-SIMD128-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-NEXT: i32.xor $push78=, $pop77, $35
-; NO-SIMD128-NEXT: i32.store8 2($0), $pop78
-; NO-SIMD128-NEXT: i32.xor $push80=, $18, $34
-; NO-SIMD128-NEXT: i32.const $push88=, -1
-; NO-SIMD128-NEXT: i32.xor $push79=, $2, $pop88
-; NO-SIMD128-NEXT: i32.and $push81=, $pop80, $pop79
-; NO-SIMD128-NEXT: i32.xor $push82=, $pop81, $34
-; NO-SIMD128-NEXT: i32.store8 1($0), $pop82
-; NO-SIMD128-NEXT: i32.xor $push84=, $17, $33
-; NO-SIMD128-NEXT: i32.const $push87=, -1
-; NO-SIMD128-NEXT: i32.xor $push83=, $1, $pop87
-; NO-SIMD128-NEXT: i32.and $push85=, $pop84, $pop83
-; NO-SIMD128-NEXT: i32.xor $push86=, $pop85, $33
-; NO-SIMD128-NEXT: i32.store8 0($0), $pop86
+; NO-SIMD128-NEXT: i32.xor $push56=, $pop55, $35
+; NO-SIMD128-NEXT: i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT: i32.xor $push58=, $18, $34
+; NO-SIMD128-NEXT: i32.const $push66=, -1
+; NO-SIMD128-NEXT: i32.xor $push57=, $2, $pop66
+; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $pop57
+; NO-SIMD128-NEXT: i32.xor $push60=, $pop59, $34
+; NO-SIMD128-NEXT: i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT: i32.xor $push62=, $17, $33
+; NO-SIMD128-NEXT: i32.const $push65=, -1
+; NO-SIMD128-NEXT: i32.xor $push61=, $1, $pop65
+; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-NEXT: i32.xor $push64=, $pop63, $33
+; NO-SIMD128-NEXT: i32.store8 0($0), $pop64
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v16i8:
@@ -6079,117 +4957,95 @@ define <16 x i8> @bitselect_xor_reversed_v16i8(<16 x i8> %c, <16 x i8> %v1, <16
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $33
; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $18, $34
-; NO-SIMD128-FAST-NEXT: i32.const $push101=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop101
+; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop79
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $34
; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $19, $35
-; NO-SIMD128-FAST-NEXT: i32.const $push100=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop100
+; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop78
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $35
; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 3
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $20, $36
-; NO-SIMD128-FAST-NEXT: i32.const $push99=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop99
+; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop77
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $36
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $21, $37
-; NO-SIMD128-FAST-NEXT: i32.const $push98=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop98
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $37
-; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 5
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $22, $38
-; NO-SIMD128-FAST-NEXT: i32.const $push97=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop97
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $38
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $23, $39
-; NO-SIMD128-FAST-NEXT: i32.const $push96=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop96
+; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $21, $37
+; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop76
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $37
+; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $22, $38
+; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop75
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $38
+; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $23, $39
+; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop74
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $39
+; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $24, $40
+; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop73
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $39
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 7
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $24, $40
-; NO-SIMD128-FAST-NEXT: i32.const $push95=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop95
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $40
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop40), $pop38
-; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $25, $41
-; NO-SIMD128-FAST-NEXT: i32.const $push94=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $9, $pop94
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $40
+; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $25, $41
+; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $9, $pop72
+; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $pop33
+; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $pop35, $41
+; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $26, $42
+; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push37=, $10, $pop71
+; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $pop37
+; NO-SIMD128-FAST-NEXT: i32.xor $push40=, $pop39, $42
+; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $27, $43
+; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $11, $pop70
; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $pop41
-; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $41
-; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 9
-; NO-SIMD128-FAST-NEXT: i32.add $push50=, $0, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $26, $42
-; NO-SIMD128-FAST-NEXT: i32.const $push93=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $10, $pop93
+; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $43
+; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $28, $44
+; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $12, $pop69
; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $pop45
-; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $42
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop50), $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push56=, $0, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $27, $43
-; NO-SIMD128-FAST-NEXT: i32.const $push92=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push51=, $11, $pop92
-; NO-SIMD128-FAST-NEXT: i32.and $push53=, $pop52, $pop51
-; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $pop53, $43
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop56), $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 11
-; NO-SIMD128-FAST-NEXT: i32.add $push62=, $0, $pop61
-; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $28, $44
-; NO-SIMD128-FAST-NEXT: i32.const $push91=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $12, $pop91
+; NO-SIMD128-FAST-NEXT: i32.xor $push48=, $pop47, $44
+; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $29, $45
+; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push49=, $13, $pop68
+; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $pop49
+; NO-SIMD128-FAST-NEXT: i32.xor $push52=, $pop51, $45
+; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $30, $46
+; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push53=, $14, $pop67
+; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $pop53
+; NO-SIMD128-FAST-NEXT: i32.xor $push56=, $pop55, $46
+; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $31, $47
+; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push57=, $15, $pop66
; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $pop57
-; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $44
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop62), $pop60
-; NO-SIMD128-FAST-NEXT: i32.const $push67=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push68=, $0, $pop67
-; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $29, $45
-; NO-SIMD128-FAST-NEXT: i32.const $push90=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push63=, $13, $pop90
-; NO-SIMD128-FAST-NEXT: i32.and $push65=, $pop64, $pop63
-; NO-SIMD128-FAST-NEXT: i32.xor $push66=, $pop65, $45
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop68), $pop66
-; NO-SIMD128-FAST-NEXT: i32.const $push73=, 13
-; NO-SIMD128-FAST-NEXT: i32.add $push74=, $0, $pop73
-; NO-SIMD128-FAST-NEXT: i32.xor $push70=, $30, $46
-; NO-SIMD128-FAST-NEXT: i32.const $push89=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push69=, $14, $pop89
-; NO-SIMD128-FAST-NEXT: i32.and $push71=, $pop70, $pop69
-; NO-SIMD128-FAST-NEXT: i32.xor $push72=, $pop71, $46
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop74), $pop72
-; NO-SIMD128-FAST-NEXT: i32.const $push79=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push80=, $0, $pop79
-; NO-SIMD128-FAST-NEXT: i32.xor $push76=, $31, $47
-; NO-SIMD128-FAST-NEXT: i32.const $push88=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push75=, $15, $pop88
-; NO-SIMD128-FAST-NEXT: i32.and $push77=, $pop76, $pop75
-; NO-SIMD128-FAST-NEXT: i32.xor $push78=, $pop77, $47
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop80), $pop78
-; NO-SIMD128-FAST-NEXT: i32.const $push85=, 15
-; NO-SIMD128-FAST-NEXT: i32.add $push86=, $0, $pop85
-; NO-SIMD128-FAST-NEXT: i32.xor $push82=, $32, $48
-; NO-SIMD128-FAST-NEXT: i32.const $push87=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push81=, $16, $pop87
-; NO-SIMD128-FAST-NEXT: i32.and $push83=, $pop82, $pop81
-; NO-SIMD128-FAST-NEXT: i32.xor $push84=, $pop83, $48
-; NO-SIMD128-FAST-NEXT: i32.store8 0($pop86), $pop84
+; NO-SIMD128-FAST-NEXT: i32.xor $push60=, $pop59, $47
+; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $32, $48
+; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push61=, $16, $pop65
+; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $pop61
+; NO-SIMD128-FAST-NEXT: i32.xor $push64=, $pop63, $48
+; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <16 x i8> %v1, %v2
%notc = xor <16 x i8> %c, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -6218,30 +5074,22 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: add_v8i16:
; NO-SIMD128: .functype add_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.add $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.add $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.add $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.add $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.add $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v8i16:
@@ -6253,24 +5101,16 @@ define <8 x i16> @add_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.add $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.add $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6292,30 +5132,22 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: sub_v8i16:
; NO-SIMD128: .functype sub_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.sub $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.sub $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.sub $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.sub $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.sub $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.sub $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.sub $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.sub $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.sub $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v8i16:
@@ -6327,24 +5159,16 @@ define <8 x i16> @sub_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6366,30 +5190,22 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: mul_v8i16:
; NO-SIMD128: .functype mul_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.mul $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.mul $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.mul $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.mul $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.mul $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.mul $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.mul $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.mul $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.mul $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.mul $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v8i16:
@@ -6401,24 +5217,16 @@ define <8 x i16> @mul_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.mul $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.mul $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = mul <8 x i16> %x, %y
ret <8 x i16> %a
@@ -6440,54 +5248,46 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_s_v8i16:
; NO-SIMD128: .functype min_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.lt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.lt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.lt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.lt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.lt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.lt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v8i16:
@@ -6508,39 +5308,31 @@ define <8 x i16> @min_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.lt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.lt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.lt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.lt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6563,70 +5355,62 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: min_u_v8i16:
; NO-SIMD128: .functype min_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.lt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.lt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.lt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.lt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.lt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.lt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.lt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v8i16:
@@ -6634,68 +5418,60 @@ define <8 x i16> @min_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.lt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.lt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.lt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.lt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.lt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.lt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.lt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6718,54 +5494,46 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_s_v8i16:
; NO-SIMD128: .functype max_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.gt_s $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $16, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push10=, 12
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $15
-; NO-SIMD128-NEXT: i32.gt_s $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $15, $pop8
-; NO-SIMD128-NEXT: i32.store16 0($pop11), $pop9
-; NO-SIMD128-NEXT: i32.const $push16=, 10
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push12=, $14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $15
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $15, $pop6
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $14
+; NO-SIMD128-NEXT: i32.gt_s $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $14, $pop10
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop11
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push12=, $13
; NO-SIMD128-NEXT: i32.gt_s $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.select $push15=, $6, $14, $pop14
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-NEXT: i32.const $push26=, 6
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push23=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push22=, $12
-; NO-SIMD128-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.select $push25=, $4, $12, $pop24
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push28=, $11
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $13, $pop14
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $12
+; NO-SIMD128-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.select $push19=, $4, $12, $pop18
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop19
+; NO-SIMD128-NEXT: i32.extend16_s $push21=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $11
+; NO-SIMD128-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-NEXT: i32.select $push23=, $3, $11, $pop22
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop23
+; NO-SIMD128-NEXT: i32.extend16_s $push25=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push24=, $10
+; NO-SIMD128-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-NEXT: i32.select $push27=, $2, $10, $pop26
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop27
+; NO-SIMD128-NEXT: i32.extend16_s $push29=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push28=, $9
; NO-SIMD128-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.select $push31=, $3, $11, $pop30
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push33=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push32=, $10
-; NO-SIMD128-NEXT: i32.gt_s $push34=, $pop33, $pop32
-; NO-SIMD128-NEXT: i32.select $push35=, $2, $10, $pop34
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop35
-; NO-SIMD128-NEXT: i32.extend16_s $push37=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push36=, $9
-; NO-SIMD128-NEXT: i32.gt_s $push38=, $pop37, $pop36
-; NO-SIMD128-NEXT: i32.select $push39=, $1, $9, $pop38
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop39
+; NO-SIMD128-NEXT: i32.select $push31=, $1, $9, $pop30
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v8i16:
@@ -6786,39 +5554,31 @@ define <8 x i16> @max_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $3, $11, $pop10
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $12
; NO-SIMD128-FAST-NEXT: i32.gt_s $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $4, $12, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $13
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.select $push21=, $5, $13, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $14
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push24=, $pop23, $pop22
-; NO-SIMD128-FAST-NEXT: i32.select $push25=, $6, $14, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push32=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $0, $pop32
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $13
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.select $push19=, $5, $13, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop19
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $14
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push22=, $pop21, $pop20
+; NO-SIMD128-FAST-NEXT: i32.select $push23=, $6, $14, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop23
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push25=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push24=, $15
+; NO-SIMD128-FAST-NEXT: i32.gt_s $push26=, $pop25, $pop24
+; NO-SIMD128-FAST-NEXT: i32.select $push27=, $7, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop27
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push29=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $16
; NO-SIMD128-FAST-NEXT: i32.gt_s $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.select $push31=, $7, $15, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop33), $pop31
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $0, $pop38
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push35=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push34=, $16
-; NO-SIMD128-FAST-NEXT: i32.gt_s $push36=, $pop35, $pop34
-; NO-SIMD128-FAST-NEXT: i32.select $push37=, $8, $16, $pop36
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop39), $pop37
+; NO-SIMD128-FAST-NEXT: i32.select $push31=, $8, $16, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop31
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6841,70 +5601,62 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: max_u_v8i16:
; NO-SIMD128: .functype max_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push0=, 65535
; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
-; NO-SIMD128-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop55
+; NO-SIMD128-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop47
; NO-SIMD128-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.select $push4=, $8, $16, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $7, $pop54
-; NO-SIMD128-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $pop53
-; NO-SIMD128-NEXT: i32.gt_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.select $push10=, $7, $15, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $6, $pop52
-; NO-SIMD128-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $pop51
-; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.select $push16=, $6, $14, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-NEXT: i32.and $push20=, $5, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $pop49
-; NO-SIMD128-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $4, $pop48
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $pop47
-; NO-SIMD128-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.select $push26=, $4, $12, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push30=, $3, $pop46
+; NO-SIMD128-NEXT: i32.and $push6=, $7, $pop46
; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $pop45
-; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.select $push32=, $3, $11, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop45
+; NO-SIMD128-NEXT: i32.gt_u $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.select $push8=, $7, $15, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push34=, $2, $pop44
+; NO-SIMD128-NEXT: i32.and $push10=, $6, $pop44
; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $pop43
-; NO-SIMD128-NEXT: i32.gt_u $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.select $push36=, $2, $10, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $pop43
+; NO-SIMD128-NEXT: i32.gt_u $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.select $push12=, $6, $14, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push38=, $1, $pop42
+; NO-SIMD128-NEXT: i32.and $push14=, $5, $pop42
; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $pop41
-; NO-SIMD128-NEXT: i32.gt_u $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.select $push40=, $1, $9, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $pop41
+; NO-SIMD128-NEXT: i32.gt_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.select $push16=, $5, $13, $pop15
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-NEXT: i32.and $push18=, $4, $pop40
+; NO-SIMD128-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $pop39
+; NO-SIMD128-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.select $push20=, $4, $12, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $3, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $pop37
+; NO-SIMD128-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.select $push24=, $3, $11, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-NEXT: i32.and $push26=, $2, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $pop35
+; NO-SIMD128-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.select $push28=, $2, $10, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-NEXT: i32.and $push30=, $1, $pop34
+; NO-SIMD128-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $pop33
+; NO-SIMD128-NEXT: i32.gt_u $push31=, $pop30, $pop29
+; NO-SIMD128-NEXT: i32.select $push32=, $1, $9, $pop31
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v8i16:
@@ -6912,68 +5664,60 @@ define <8 x i16> @max_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop55
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
; NO-SIMD128-FAST-NEXT: i32.gt_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.select $push4=, $1, $9, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop54
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $2, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop45
; NO-SIMD128-FAST-NEXT: i32.gt_u $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.select $push8=, $2, $10, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop52
-; NO-SIMD128-FAST-NEXT: i32.const $push51=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $3, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $pop43
; NO-SIMD128-FAST-NEXT: i32.gt_u $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.select $push12=, $3, $11, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $4, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $pop41
; NO-SIMD128-FAST-NEXT: i32.gt_u $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.select $push16=, $4, $12, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push48=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $5, $pop48
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $pop47
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.select $push22=, $5, $13, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $6, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $pop45
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.select $push26=, $6, $14, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $7, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $5, $pop40
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $pop39
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.select $push20=, $5, $13, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $pop37
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.select $push24=, $6, $14, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push26=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop35
+; NO-SIMD128-FAST-NEXT: i32.gt_u $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.select $push28=, $7, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push30=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop33
; NO-SIMD128-FAST-NEXT: i32.gt_u $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.select $push32=, $7, $15, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push36=, $8, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $pop41
-; NO-SIMD128-FAST-NEXT: i32.gt_u $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.select $push38=, $8, $16, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.select $push32=, $8, $16, $pop31
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <8 x i16> %x, %y
%a = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
@@ -6996,78 +5740,70 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16:
; NO-SIMD128: .functype avgr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16:
@@ -7078,73 +5814,65 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add nuw <8 x i16> %x, %y
%b = add nuw <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7176,78 +5904,70 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: avgr_u_v8i16_wrap:
; NO-SIMD128: .functype avgr_u_v8i16_wrap (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.add $push2=, $8, $16
-; NO-SIMD128-NEXT: i32.const $push3=, 1
-; NO-SIMD128-NEXT: i32.add $push4=, $pop2, $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 65534
-; NO-SIMD128-NEXT: i32.and $push6=, $pop4, $pop5
-; NO-SIMD128-NEXT: i32.const $push63=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push7=, $pop6, $pop63
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $pop7
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.add $push10=, $7, $15
-; NO-SIMD128-NEXT: i32.const $push62=, 1
-; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop62
-; NO-SIMD128-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop61
-; NO-SIMD128-NEXT: i32.const $push60=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop60
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop13
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.add $push16=, $6, $14
-; NO-SIMD128-NEXT: i32.const $push59=, 1
-; NO-SIMD128-NEXT: i32.add $push17=, $pop16, $pop59
-; NO-SIMD128-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-NEXT: i32.and $push18=, $pop17, $pop58
-; NO-SIMD128-NEXT: i32.const $push57=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push19=, $pop18, $pop57
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-NEXT: i32.const $push56=, 1
-; NO-SIMD128-NEXT: i32.add $push21=, $pop20, $pop56
-; NO-SIMD128-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $pop55
+; NO-SIMD128-NEXT: i32.add $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.const $push1=, 1
+; NO-SIMD128-NEXT: i32.add $push2=, $pop0, $pop1
+; NO-SIMD128-NEXT: i32.const $push3=, 65534
+; NO-SIMD128-NEXT: i32.and $push4=, $pop2, $pop3
+; NO-SIMD128-NEXT: i32.const $push55=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push5=, $pop4, $pop55
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop5
+; NO-SIMD128-NEXT: i32.add $push6=, $7, $15
; NO-SIMD128-NEXT: i32.const $push54=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push23=, $pop22, $pop54
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-NEXT: i32.const $push24=, 6
-; NO-SIMD128-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-NEXT: i32.add $push26=, $4, $12
-; NO-SIMD128-NEXT: i32.const $push53=, 1
-; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop53
-; NO-SIMD128-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop52
+; NO-SIMD128-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-NEXT: i32.const $push52=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop52
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop9
+; NO-SIMD128-NEXT: i32.add $push10=, $6, $14
; NO-SIMD128-NEXT: i32.const $push51=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop51
-; NO-SIMD128-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-NEXT: i32.add $push30=, $3, $11
-; NO-SIMD128-NEXT: i32.const $push50=, 1
-; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop50
-; NO-SIMD128-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop49
+; NO-SIMD128-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-NEXT: i32.const $push49=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop13
+; NO-SIMD128-NEXT: i32.add $push14=, $5, $13
; NO-SIMD128-NEXT: i32.const $push48=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop48
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop33
-; NO-SIMD128-NEXT: i32.add $push34=, $2, $10
-; NO-SIMD128-NEXT: i32.const $push47=, 1
-; NO-SIMD128-NEXT: i32.add $push35=, $pop34, $pop47
-; NO-SIMD128-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-NEXT: i32.and $push36=, $pop35, $pop46
+; NO-SIMD128-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-NEXT: i32.const $push46=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
+; NO-SIMD128-NEXT: i32.add $push18=, $4, $12
; NO-SIMD128-NEXT: i32.const $push45=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push37=, $pop36, $pop45
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop37
-; NO-SIMD128-NEXT: i32.add $push38=, $1, $9
-; NO-SIMD128-NEXT: i32.const $push44=, 1
-; NO-SIMD128-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-NEXT: i32.const $push43=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop21
+; NO-SIMD128-NEXT: i32.add $push22=, $3, $11
; NO-SIMD128-NEXT: i32.const $push42=, 1
-; NO-SIMD128-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop41
+; NO-SIMD128-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-NEXT: i32.const $push40=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
+; NO-SIMD128-NEXT: i32.add $push26=, $2, $10
+; NO-SIMD128-NEXT: i32.const $push39=, 1
+; NO-SIMD128-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-NEXT: i32.const $push37=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
+; NO-SIMD128-NEXT: i32.add $push30=, $1, $9
+; NO-SIMD128-NEXT: i32.const $push36=, 1
+; NO-SIMD128-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-NEXT: i32.const $push34=, 1
+; NO-SIMD128-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop33
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: avgr_u_v8i16_wrap:
@@ -7258,73 +5978,65 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $pop0, $pop1
; NO-SIMD128-FAST-NEXT: i32.const $push3=, 65534
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop2, $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push63=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop63
+; NO-SIMD128-FAST-NEXT: i32.const $push55=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $pop4, $pop55
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop5
; NO-SIMD128-FAST-NEXT: i32.add $push6=, $2, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push62=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop62
-; NO-SIMD128-FAST-NEXT: i32.const $push61=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop61
-; NO-SIMD128-FAST-NEXT: i32.const $push60=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop60
+; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push7=, $pop6, $pop54
+; NO-SIMD128-FAST-NEXT: i32.const $push53=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $pop7, $pop53
+; NO-SIMD128-FAST-NEXT: i32.const $push52=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop52
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.add $push10=, $3, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push59=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop59
-; NO-SIMD128-FAST-NEXT: i32.const $push58=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop58
-; NO-SIMD128-FAST-NEXT: i32.const $push57=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop57
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push56=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $pop16, $pop56
-; NO-SIMD128-FAST-NEXT: i32.const $push55=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $pop17, $pop55
-; NO-SIMD128-FAST-NEXT: i32.const $push54=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push19=, $pop18, $pop54
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop19
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push21=, $pop20, $pop53
-; NO-SIMD128-FAST-NEXT: i32.const $push52=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $pop52
; NO-SIMD128-FAST-NEXT: i32.const $push51=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push23=, $pop22, $pop51
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push24=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push25=, $0, $pop24
-; NO-SIMD128-FAST-NEXT: i32.add $push26=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop50
-; NO-SIMD128-FAST-NEXT: i32.const $push49=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop49
+; NO-SIMD128-FAST-NEXT: i32.add $push11=, $pop10, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push50=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $pop50
+; NO-SIMD128-FAST-NEXT: i32.const $push49=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push13=, $pop12, $pop49
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop13
+; NO-SIMD128-FAST-NEXT: i32.add $push14=, $4, $12
; NO-SIMD128-FAST-NEXT: i32.const $push48=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop48
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop25), $pop29
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push33=, $pop32, $pop47
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $pop46
+; NO-SIMD128-FAST-NEXT: i32.add $push15=, $pop14, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push46=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop46
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.add $push18=, $5, $13
; NO-SIMD128-FAST-NEXT: i32.const $push45=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push35=, $pop34, $pop45
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop31), $pop35
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push37=, $0, $pop36
-; NO-SIMD128-FAST-NEXT: i32.add $push38=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 1
-; NO-SIMD128-FAST-NEXT: i32.add $push39=, $pop38, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65534
-; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $pop43
+; NO-SIMD128-FAST-NEXT: i32.add $push19=, $pop18, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $pop44
+; NO-SIMD128-FAST-NEXT: i32.const $push43=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop43
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.add $push22=, $6, $14
; NO-SIMD128-FAST-NEXT: i32.const $push42=, 1
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push41=, $pop40, $pop42
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop37), $pop41
+; NO-SIMD128-FAST-NEXT: i32.add $push23=, $pop22, $pop42
+; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push24=, $pop23, $pop41
+; NO-SIMD128-FAST-NEXT: i32.const $push40=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop40
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop25
+; NO-SIMD128-FAST-NEXT: i32.add $push26=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push27=, $pop26, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push29=, $pop28, $pop37
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop29
+; NO-SIMD128-FAST-NEXT: i32.add $push30=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 1
+; NO-SIMD128-FAST-NEXT: i32.add $push31=, $pop30, $pop36
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65534
+; NO-SIMD128-FAST-NEXT: i32.and $push32=, $pop31, $pop35
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, 1
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push33=, $pop32, $pop34
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop33
; NO-SIMD128-FAST-NEXT: return
%a = add <8 x i16> %x, %y
%b = add <8 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -7348,70 +6060,62 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-LABEL: abs_v8i16:
; NO-SIMD128: .functype abs_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 14
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: i32.extend16_s $push0=, $8
; NO-SIMD128-NEXT: i32.const $push1=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop54
+; NO-SIMD128-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop46
; NO-SIMD128-NEXT: i32.sub $push3=, $pop2, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $pop3
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $7
-; NO-SIMD128-NEXT: i32.const $push53=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push52=, $pop6, $pop53
-; NO-SIMD128-NEXT: local.tee $push51=, $8=, $pop52
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop51
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push14=, 10
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $6
-; NO-SIMD128-NEXT: i32.const $push50=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push49=, $pop11, $pop50
-; NO-SIMD128-NEXT: local.tee $push48=, $8=, $pop49
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $pop48
-; NO-SIMD128-NEXT: i32.sub $push13=, $pop12, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
-; NO-SIMD128-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-NEXT: i32.const $push47=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push46=, $pop16, $pop47
-; NO-SIMD128-NEXT: local.tee $push45=, $8=, $pop46
-; NO-SIMD128-NEXT: i32.xor $push17=, $5, $pop45
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.const $push45=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-NEXT: local.tee $push43=, $8=, $pop44
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop43
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.const $push42=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-NEXT: local.tee $push40=, $8=, $pop41
+; NO-SIMD128-NEXT: i32.xor $push8=, $6, $pop40
+; NO-SIMD128-NEXT: i32.sub $push9=, $pop8, $8
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.const $push39=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-NEXT: local.tee $push37=, $8=, $pop38
+; NO-SIMD128-NEXT: i32.xor $push11=, $5, $pop37
+; NO-SIMD128-NEXT: i32.sub $push12=, $pop11, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $4
+; NO-SIMD128-NEXT: i32.const $push36=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-NEXT: local.tee $push34=, $8=, $pop35
+; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop34
+; NO-SIMD128-NEXT: i32.sub $push15=, $pop14, $8
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push16=, $3
+; NO-SIMD128-NEXT: i32.const $push33=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-NEXT: local.tee $push31=, $8=, $pop32
+; NO-SIMD128-NEXT: i32.xor $push17=, $3, $pop31
; NO-SIMD128-NEXT: i32.sub $push18=, $pop17, $8
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop18
-; NO-SIMD128-NEXT: i32.const $push22=, 6
-; NO-SIMD128-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $4
-; NO-SIMD128-NEXT: i32.const $push44=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push43=, $pop19, $pop44
-; NO-SIMD128-NEXT: local.tee $push42=, $8=, $pop43
-; NO-SIMD128-NEXT: i32.xor $push20=, $4, $pop42
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push19=, $2
+; NO-SIMD128-NEXT: i32.const $push30=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-NEXT: local.tee $push28=, $8=, $pop29
+; NO-SIMD128-NEXT: i32.xor $push20=, $2, $pop28
; NO-SIMD128-NEXT: i32.sub $push21=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop23), $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $3
-; NO-SIMD128-NEXT: i32.const $push41=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push40=, $pop24, $pop41
-; NO-SIMD128-NEXT: local.tee $push39=, $8=, $pop40
-; NO-SIMD128-NEXT: i32.xor $push25=, $3, $pop39
-; NO-SIMD128-NEXT: i32.sub $push26=, $pop25, $8
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push27=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push37=, $pop27, $pop38
-; NO-SIMD128-NEXT: local.tee $push36=, $8=, $pop37
-; NO-SIMD128-NEXT: i32.xor $push28=, $2, $pop36
-; NO-SIMD128-NEXT: i32.sub $push29=, $pop28, $8
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop29
-; NO-SIMD128-NEXT: i32.extend16_s $push30=, $1
-; NO-SIMD128-NEXT: i32.const $push35=, 15
-; NO-SIMD128-NEXT: i32.shr_s $push34=, $pop30, $pop35
-; NO-SIMD128-NEXT: local.tee $push33=, $8=, $pop34
-; NO-SIMD128-NEXT: i32.xor $push31=, $1, $pop33
-; NO-SIMD128-NEXT: i32.sub $push32=, $pop31, $8
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push22=, $1
+; NO-SIMD128-NEXT: i32.const $push27=, 15
+; NO-SIMD128-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-NEXT: local.tee $push25=, $8=, $pop26
+; NO-SIMD128-NEXT: i32.xor $push23=, $1, $pop25
+; NO-SIMD128-NEXT: i32.sub $push24=, $pop23, $8
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v8i16:
@@ -7419,68 +6123,60 @@ define <8 x i16> @abs_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push0=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push1=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push55=, $pop0, $pop1
-; NO-SIMD128-FAST-NEXT: local.tee $push54=, $9=, $pop55
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop54
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push47=, $pop0, $pop1
+; NO-SIMD128-FAST-NEXT: local.tee $push46=, $9=, $pop47
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop46
; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop2, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push53=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push52=, $pop4, $pop53
-; NO-SIMD128-FAST-NEXT: local.tee $push51=, $1=, $pop52
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop51
+; NO-SIMD128-FAST-NEXT: i32.const $push45=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push44=, $pop4, $pop45
+; NO-SIMD128-FAST-NEXT: local.tee $push43=, $1=, $pop44
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop43
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push50=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push49=, $pop7, $pop50
-; NO-SIMD128-FAST-NEXT: local.tee $push48=, $2=, $pop49
-; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop48
+; NO-SIMD128-FAST-NEXT: i32.const $push42=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push41=, $pop7, $pop42
+; NO-SIMD128-FAST-NEXT: local.tee $push40=, $2=, $pop41
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $3, $pop40
; NO-SIMD128-FAST-NEXT: i32.sub $push9=, $pop8, $2
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push46=, $pop10, $pop47
-; NO-SIMD128-FAST-NEXT: local.tee $push45=, $3=, $pop46
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push38=, $pop10, $pop39
+; NO-SIMD128-FAST-NEXT: local.tee $push37=, $3=, $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.sub $push12=, $pop11, $3
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push43=, $pop15, $pop44
-; NO-SIMD128-FAST-NEXT: local.tee $push42=, $4=, $pop43
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $5, $pop42
-; NO-SIMD128-FAST-NEXT: i32.sub $push17=, $pop16, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push18=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push40=, $pop18, $pop41
-; NO-SIMD128-FAST-NEXT: local.tee $push39=, $5=, $pop40
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $6, $pop39
-; NO-SIMD128-FAST-NEXT: i32.sub $push20=, $pop19, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push37=, $pop23, $pop38
-; NO-SIMD128-FAST-NEXT: local.tee $push36=, $6=, $pop37
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $7, $pop36
-; NO-SIMD128-FAST-NEXT: i32.sub $push25=, $pop24, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push28=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 15
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push34=, $pop28, $pop35
-; NO-SIMD128-FAST-NEXT: local.tee $push33=, $0=, $pop34
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
-; NO-SIMD128-FAST-NEXT: i32.sub $push30=, $pop29, $0
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push35=, $pop13, $pop36
+; NO-SIMD128-FAST-NEXT: local.tee $push34=, $4=, $pop35
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $5, $pop34
+; NO-SIMD128-FAST-NEXT: i32.sub $push15=, $pop14, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop16, $pop33
+; NO-SIMD128-FAST-NEXT: local.tee $push31=, $5=, $pop32
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $6, $pop31
+; NO-SIMD128-FAST-NEXT: i32.sub $push18=, $pop17, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push29=, $pop19, $pop30
+; NO-SIMD128-FAST-NEXT: local.tee $push28=, $6=, $pop29
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.sub $push21=, $pop20, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 15
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push26=, $pop22, $pop27
+; NO-SIMD128-FAST-NEXT: local.tee $push25=, $7=, $pop26
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $8, $pop25
+; NO-SIMD128-FAST-NEXT: i32.sub $push24=, $pop23, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> zeroinitializer, %x
%b = icmp slt <8 x i16> %x, zeroinitializer
@@ -7505,37 +6201,29 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype neg_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $5
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop23, $3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop22, $2
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop21, $1
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 0
-; NO-SIMD128-NEXT: i32.sub $push5=, $pop20, $8
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 0
-; NO-SIMD128-NEXT: i32.sub $push8=, $pop19, $7
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 0
-; NO-SIMD128-NEXT: i32.sub $push11=, $pop18, $6
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 0
-; NO-SIMD128-NEXT: i32.sub $push14=, $pop17, $4
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $8
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 0
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop15, $7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop14, $6
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop13, $5
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 0
+; NO-SIMD128-NEXT: i32.sub $push5=, $pop12, $4
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 0
+; NO-SIMD128-NEXT: i32.sub $push6=, $pop11, $3
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 0
+; NO-SIMD128-NEXT: i32.sub $push7=, $pop10, $2
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 0
+; NO-SIMD128-NEXT: i32.sub $push8=, $pop9, $1
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v8i16:
@@ -7544,35 +6232,27 @@ define <8 x i16> @neg_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop23, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop15, $2
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop22, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop14, $3
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop21, $4
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop20, $5
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push10=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push13=, $pop18, $7
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push16=, $pop17, $8
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop13, $4
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop11, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push7=, $pop10, $7
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop9, $8
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>,
%x
@@ -7596,64 +6276,48 @@ define <8 x i16> @shl_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop17
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop9
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $9
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $9
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $9
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v8i16:
; NO-SIMD128-FAST: .functype shl_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $9=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $9=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $2, $pop9
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $9
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $9
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -7681,37 +6345,29 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128: .functype shl_const_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, 5
-; NO-SIMD128-NEXT: i32.shl $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, 5
-; NO-SIMD128-NEXT: i32.shl $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.shl $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, 5
-; NO-SIMD128-NEXT: i32.shl $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.shl $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, 5
+; NO-SIMD128-NEXT: i32.shl $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, 5
+; NO-SIMD128-NEXT: i32.shl $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, 5
+; NO-SIMD128-NEXT: i32.shl $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, 5
+; NO-SIMD128-NEXT: i32.shl $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, 5
+; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v8i16:
@@ -7720,35 +6376,27 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v,
<i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -7866,45 +6514,37 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shl_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.shl $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.shl $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.shl $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.shl $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.shl $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.shl $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.shl $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.shl $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.shl $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.shl $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.shl $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.shl $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.shl $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.shl $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.shl $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.shl $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v8i16:
@@ -7914,42 +6554,34 @@ define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shl $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shl $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.shl $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shl $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.shl $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shl $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shl $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%a = shl <8 x i16> %v, %x
ret <8 x i16> %a
@@ -7971,41 +6603,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: shr_s_v8i16:
; NO-SIMD128: .functype shr_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-NEXT: local.tee $push25=, $9=, $pop26
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop25
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $3
+; NO-SIMD128-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-NEXT: local.tee $push17=, $9=, $pop18
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $pop1, $pop17
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $7
; NO-SIMD128-NEXT: i32.shr_s $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $6
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
; NO-SIMD128-NEXT: i32.shr_s $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $8
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $4
; NO-SIMD128-NEXT: i32.shr_s $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $3
+; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push13=, $2
; NO-SIMD128-NEXT: i32.shr_s $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.extend16_s $push17=, $6
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.extend16_s $push21=, $4
-; NO-SIMD128-NEXT: i32.shr_s $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.extend16_s $push15=, $1
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v8i16:
@@ -8013,9 +6637,9 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push1=, $1
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push26=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push25=, $1=, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop25
+; NO-SIMD128-FAST-NEXT: i32.and $push18=, $9, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $pop1, $pop17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push3=, $2
; NO-SIMD128-FAST-NEXT: i32.shr_s $push4=, $pop3, $1
@@ -8023,29 +6647,21 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $3
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push7=, $4
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push8=, $pop7, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $5
; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $pop9, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $6
; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $6
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $7
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push15=, $8
; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $pop15, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push20=, $pop19, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8164,54 +6780,46 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v8i16:
; NO-SIMD128: .functype shr_s_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push2=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push2=, $8
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop0
; NO-SIMD128-NEXT: i32.shr_s $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.extend16_s $push5=, $3
-; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop39
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
+; NO-SIMD128-NEXT: i32.extend16_s $push5=, $7
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop31
; NO-SIMD128-NEXT: i32.shr_s $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.extend16_s $push8=, $2
-; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop38
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-NEXT: i32.extend16_s $push8=, $6
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop30
; NO-SIMD128-NEXT: i32.shr_s $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.extend16_s $push11=, $1
-; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop37
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
+; NO-SIMD128-NEXT: i32.extend16_s $push11=, $5
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop29
; NO-SIMD128-NEXT: i32.shr_s $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.extend16_s $push14=, $8
-; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop36
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.extend16_s $push14=, $4
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop28
; NO-SIMD128-NEXT: i32.shr_s $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.extend16_s $push19=, $7
-; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop35
-; NO-SIMD128-NEXT: i32.shr_s $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
-; NO-SIMD128-NEXT: i32.extend16_s $push24=, $6
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop34
-; NO-SIMD128-NEXT: i32.shr_s $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
-; NO-SIMD128-NEXT: i32.extend16_s $push29=, $4
-; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_s $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.extend16_s $push17=, $3
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop27
+; NO-SIMD128-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.extend16_s $push20=, $2
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop26
+; NO-SIMD128-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.extend16_s $push23=, $1
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v8i16:
@@ -8223,48 +6831,40 @@ define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push5=, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop39
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop31
; NO-SIMD128-FAST-NEXT: i32.shr_s $push6=, $pop5, $pop4
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push8=, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop38
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop30
; NO-SIMD128-FAST-NEXT: i32.shr_s $push9=, $pop8, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push13=, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $12, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push14=, $pop13, $pop12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop14
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push16=, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop36
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push21=, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $14, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push22=, $pop21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop19), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push26=, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $pop34
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push27=, $pop26, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop27
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push29=, $0, $pop28
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push31=, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push32=, $pop31, $pop30
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop29), $pop32
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $4
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push14=, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop28
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push17=, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push20=, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop26
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push23=, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = ashr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8287,48 +6887,40 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128: .functype shr_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-NEXT: local.tee $push32=, $9=, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop32
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-NEXT: i32.and $push3=, $3, $pop31
+; NO-SIMD128-NEXT: i32.and $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-NEXT: local.tee $push24=, $9=, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $pop1, $pop24
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-NEXT: i32.and $push3=, $7, $pop23
; NO-SIMD128-NEXT: i32.shr_u $push4=, $pop3, $9
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop30
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop22
; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $9
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $1, $pop29
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-NEXT: i32.and $push7=, $5, $pop21
; NO-SIMD128-NEXT: i32.shr_u $push8=, $pop7, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-NEXT: i32.and $push9=, $8, $pop28
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-NEXT: i32.and $push9=, $4, $pop20
; NO-SIMD128-NEXT: i32.shr_u $push10=, $pop9, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $7, $pop27
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $3, $pop19
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $9
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $2, $pop18
; NO-SIMD128-NEXT: i32.shr_u $push14=, $pop13, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-NEXT: i32.and $push17=, $6, $pop26
-; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-NEXT: i32.and $push21=, $4, $pop25
-; NO-SIMD128-NEXT: i32.shr_u $push22=, $pop21, $9
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-NEXT: i32.and $push15=, $1, $pop17
+; NO-SIMD128-NEXT: i32.shr_u $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v8i16:
@@ -8336,47 +6928,39 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push1=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $pop34
-; NO-SIMD128-FAST-NEXT: local.tee $push32=, $1=, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $9, $pop26
+; NO-SIMD128-FAST-NEXT: local.tee $push24=, $1=, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $pop1, $pop24
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $2, $pop23
; NO-SIMD128-FAST-NEXT: i32.shr_u $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $3, $pop22
; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $1
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop29
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $4, $pop21
; NO-SIMD128-FAST-NEXT: i32.shr_u $push8=, $pop7, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $5, $pop28
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push9=, $5, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push10=, $pop9, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $6, $pop19
; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push15=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push13=, $6, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $7, $pop18
; NO-SIMD128-FAST-NEXT: i32.shr_u $push14=, $pop13, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push17=, $7, $pop26
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $8, $pop25
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push22=, $pop21, $1
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $pop17
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push16=, $pop15, $1
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <8 x i16> undef, i16 %x, i32 0
%s = shufflevector <8 x i16> %t, <8 x i16> undef,
@@ -8496,61 +7080,53 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128: .functype shr_u_vec_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $13, $pop47
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop45
-; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop43
-; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop41
-; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-NEXT: i32.and $push13=, $16, $pop39
-; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop39
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-NEXT: i32.and $push18=, $15, $pop37
-; NO-SIMD128-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop37
+; NO-SIMD128-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-NEXT: i32.and $push23=, $14, $pop35
-; NO-SIMD128-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop35
+; NO-SIMD128-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-NEXT: i32.and $push28=, $12, $pop33
-; NO-SIMD128-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop33
+; NO-SIMD128-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-NEXT: i32.and $push13=, $12, $pop31
+; NO-SIMD128-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-NEXT: i32.and $push16=, $11, $pop29
+; NO-SIMD128-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-NEXT: i32.and $push19=, $10, $pop27
+; NO-SIMD128-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-NEXT: i32.and $push22=, $9, $pop25
+; NO-SIMD128-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v8i16:
@@ -8558,60 +7134,52 @@ define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop39
; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop45
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop43
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop41
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $13, $pop39
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $14, $pop37
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop37
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $15, $pop35
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop35
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $16, $pop33
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop33
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $13, $pop31
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $14, $pop29
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $15, $pop27
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $16, $pop25
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%a = lshr <8 x i16> %v, %x
ret <8 x i16> %a
@@ -8633,30 +7201,22 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: and_v8i16:
; NO-SIMD128: .functype and_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.and $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.and $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.and $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.and $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.and $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.and $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v8i16:
@@ -8668,24 +7228,16 @@ define <8 x i16> @and_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.and $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.and $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = and <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8707,30 +7259,22 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: or_v8i16:
; NO-SIMD128: .functype or_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.or $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.or $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.or $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.or $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.or $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.or $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.or $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.or $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.or $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.or $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v8i16:
@@ -8742,24 +7286,16 @@ define <8 x i16> @or_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.or $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.or $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.or $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.or $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.or $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.or $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.or $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = or <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8781,30 +7317,22 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: xor_v8i16:
; NO-SIMD128: .functype xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $5, $13
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $11
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $10
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: i32.xor $push4=, $8, $16
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $15
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push11=, 10
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push10=, $6, $14
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push14=, 6
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.xor $push13=, $4, $12
-; NO-SIMD128-NEXT: i32.store16 0($pop15), $pop13
+; NO-SIMD128-NEXT: i32.xor $push0=, $8, $16
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $7, $15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $6, $14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $5, $13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
+; NO-SIMD128-NEXT: i32.xor $push4=, $4, $12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $2, $10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v8i16:
@@ -8816,24 +7344,16 @@ define <8 x i16> @xor_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $12
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop4), $pop5
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $5, $13
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $6, $14
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push10=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $7, $15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop11), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $8, $16
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop15
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $12
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop3
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $5, $13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $6, $14
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $15
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $16
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, %y
ret <8 x i16> %a
@@ -8856,37 +7376,29 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128: .functype not_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $5, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push23=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop23
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push22=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop22
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push6=, 14
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $8, $pop20
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $pop5
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop19
-; NO-SIMD128-NEXT: i32.store16 0($pop10), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 10
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.const $push18=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $6, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop13), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 6
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push17=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $4, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop1
+; NO-SIMD128-NEXT: i32.const $push15=, -1
+; NO-SIMD128-NEXT: i32.xor $push2=, $7, $pop15
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push14=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop14
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push13=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $5, $pop13
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push12=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $4, $pop12
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop5
+; NO-SIMD128-NEXT: i32.const $push11=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop10
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop7
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push8=, $1, $pop9
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v8i16:
@@ -8895,35 +7407,27 @@ define <8 x i16> @not_v8i16(<8 x i16> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop23
+; NO-SIMD128-FAST-NEXT: i32.const $push15=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop22
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop14
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop5), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $5, $pop20
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $6, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop9), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $7, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop12), $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push14=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $8, $pop17
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop15), $pop16
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop4
+; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $5, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop5
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $7, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop7
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = xor <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1>
@@ -8948,45 +7452,37 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128: .functype andnot_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $13, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push31=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $11, $pop31
-; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push30=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $10, $pop30
-; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push29=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $9, $pop29
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push11=, 14
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.const $push28=, -1
-; NO-SIMD128-NEXT: i32.xor $push9=, $16, $pop28
-; NO-SIMD128-NEXT: i32.and $push10=, $8, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push15=, 12
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.const $push27=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $15, $pop27
-; NO-SIMD128-NEXT: i32.and $push14=, $7, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push19=, 10
-; NO-SIMD128-NEXT: i32.add $push20=, $0, $pop19
-; NO-SIMD128-NEXT: i32.const $push26=, -1
-; NO-SIMD128-NEXT: i32.xor $push17=, $14, $pop26
-; NO-SIMD128-NEXT: i32.and $push18=, $6, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop20), $pop18
-; NO-SIMD128-NEXT: i32.const $push23=, 6
-; NO-SIMD128-NEXT: i32.add $push24=, $0, $pop23
-; NO-SIMD128-NEXT: i32.const $push25=, -1
-; NO-SIMD128-NEXT: i32.xor $push21=, $12, $pop25
-; NO-SIMD128-NEXT: i32.and $push22=, $4, $pop21
-; NO-SIMD128-NEXT: i32.store16 0($pop24), $pop22
+; NO-SIMD128-NEXT: i32.xor $push1=, $16, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push23=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $pop23
+; NO-SIMD128-NEXT: i32.and $push4=, $7, $pop3
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push22=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $14, $pop22
+; NO-SIMD128-NEXT: i32.and $push6=, $6, $pop5
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push21=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $13, $pop21
+; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop7
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push20=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $12, $pop20
+; NO-SIMD128-NEXT: i32.and $push10=, $4, $pop9
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop10
+; NO-SIMD128-NEXT: i32.const $push19=, -1
+; NO-SIMD128-NEXT: i32.xor $push11=, $11, $pop19
+; NO-SIMD128-NEXT: i32.and $push12=, $3, $pop11
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $10, $pop18
+; NO-SIMD128-NEXT: i32.and $push14=, $2, $pop13
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop14
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push15=, $9, $pop17
+; NO-SIMD128-NEXT: i32.and $push16=, $1, $pop15
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v8i16:
@@ -8996,42 +7492,34 @@ define <8 x i16> @andnot_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $9, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop31
+; NO-SIMD128-FAST-NEXT: i32.const $push23=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $pop23
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push30=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push22=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $11, $pop22
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push29=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $pop29
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop8), $pop10
-; NO-SIMD128-FAST-NEXT: i32.const $push28=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $13, $pop28
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $5, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $6, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $15, $pop26
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push25=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $16, $pop25
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $8, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop24
+; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $12, $pop21
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $13, $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $5, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $14, $pop19
+; NO-SIMD128-FAST-NEXT: i32.and $push12=, $6, $pop11
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $15, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $7, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $16, $pop17
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $8, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <8 x i16> %y,
<i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -9058,62 +7546,54 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: bitselect_v8i16:
; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.and $push0=, $16, $8
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2
; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.and $push7=, $15, $7
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $23, $pop8
-; NO-SIMD128-NEXT: i32.or $push10=, $pop7, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.and $push13=, $14, $6
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push14=, $6, $pop46
-; NO-SIMD128-NEXT: i32.and $push15=, $22, $pop14
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $7
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6
+; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.and $push9=, $14, $6
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10
+; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.and $push13=, $13, $5
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37
+; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14
; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push20=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.and $push23=, $12, $4
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push24=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $20, $pop24
-; NO-SIMD128-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.and $push29=, $11, $3
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push30=, $3, $pop43
-; NO-SIMD128-NEXT: i32.and $push31=, $19, $pop30
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.and $push17=, $12, $4
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18
+; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.and $push21=, $11, $3
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22
+; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.and $push25=, $10, $2
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26
+; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.and $push29=, $9, $1
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33
+; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30
; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.and $push33=, $10, $2
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push34=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $18, $pop34
-; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.and $push37=, $9, $1
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push38=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $17, $pop38
-; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
@@ -9126,55 +7606,47 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $13, $5
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $21, $pop20
-; NO-SIMD128-FAST-NEXT: i32.or $push22=, $pop19, $pop21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $14, $6
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $22, $pop24
-; NO-SIMD128-FAST-NEXT: i32.or $push26=, $pop23, $pop25
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $15, $7
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $7, $pop42
-; NO-SIMD128-FAST-NEXT: i32.and $push31=, $23, $pop30
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18
+; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22
+; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26
+; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33
+; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30
; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.and $push35=, $16, $8
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $24, $pop36
-; NO-SIMD128-FAST-NEXT: i32.or $push38=, $pop35, $pop37
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <8 x i16> %v1, %c
%inv_mask = xor <8 x i16>
@@ -9203,46 +7675,38 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-LABEL: bitselect_xor_v8i16:
; NO-SIMD128: .functype bitselect_xor_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 14
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.xor $push5=, $15, $23
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $7
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop9), $pop7
-; NO-SIMD128-NEXT: i32.const $push13=, 10
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
-; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $6
-; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-NEXT: i32.xor $push15=, $13, $21
-; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $5
-; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-NEXT: i32.const $push21=, 6
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
-; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
-; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $4
-; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.xor $push23=, $11, $19
-; NO-SIMD128-NEXT: i32.and $push24=, $pop23, $3
-; NO-SIMD128-NEXT: i32.xor $push25=, $pop24, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop25
-; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
-; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $2
-; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT: i32.xor $push29=, $9, $17
-; NO-SIMD128-NEXT: i32.and $push30=, $pop29, $1
-; NO-SIMD128-NEXT: i32.xor $push31=, $pop30, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop31
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v8i16:
@@ -9260,34 +7724,26 @@ define <8 x i16> @bitselect_xor_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $5
-; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.and $push20=, $pop19, $6
-; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $pop20, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $7
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.and $push30=, $pop29, $8
-; NO-SIMD128-FAST-NEXT: i32.xor $push31=, $pop30, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%and = and <8 x i16> %xor1, %c
@@ -9314,62 +7770,54 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v8i16:
; NO-SIMD128: .functype bitselect_xor_reversed_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 14
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $16, $24
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $24
-; NO-SIMD128-NEXT: i32.store16 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push11=, 12
-; NO-SIMD128-NEXT: i32.add $push12=, $0, $pop11
-; NO-SIMD128-NEXT: i32.xor $push8=, $15, $23
-; NO-SIMD128-NEXT: i32.const $push47=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $7, $pop47
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $23
-; NO-SIMD128-NEXT: i32.store16 0($pop12), $pop10
-; NO-SIMD128-NEXT: i32.const $push17=, 10
-; NO-SIMD128-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-NEXT: i32.xor $push14=, $14, $22
-; NO-SIMD128-NEXT: i32.const $push46=, -1
-; NO-SIMD128-NEXT: i32.xor $push13=, $6, $pop46
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $15, $23
+; NO-SIMD128-NEXT: i32.const $push39=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $7, $pop39
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $23
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $14, $22
+; NO-SIMD128-NEXT: i32.const $push38=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $6, $pop38
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $22
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $13, $21
+; NO-SIMD128-NEXT: i32.const $push37=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $5, $pop37
; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $22
-; NO-SIMD128-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-NEXT: i32.const $push45=, -1
-; NO-SIMD128-NEXT: i32.xor $push19=, $5, $pop45
-; NO-SIMD128-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-NEXT: i32.const $push27=, 6
-; NO-SIMD128-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-NEXT: i32.xor $push24=, $12, $20
-; NO-SIMD128-NEXT: i32.const $push44=, -1
-; NO-SIMD128-NEXT: i32.xor $push23=, $4, $pop44
-; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $20
-; NO-SIMD128-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-NEXT: i32.xor $push30=, $11, $19
-; NO-SIMD128-NEXT: i32.const $push43=, -1
-; NO-SIMD128-NEXT: i32.xor $push29=, $3, $pop43
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $21
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT: i32.xor $push18=, $12, $20
+; NO-SIMD128-NEXT: i32.const $push36=, -1
+; NO-SIMD128-NEXT: i32.xor $push17=, $4, $pop36
+; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $20
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT: i32.xor $push22=, $11, $19
+; NO-SIMD128-NEXT: i32.const $push35=, -1
+; NO-SIMD128-NEXT: i32.xor $push21=, $3, $pop35
+; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.xor $push24=, $pop23, $19
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT: i32.xor $push26=, $10, $18
+; NO-SIMD128-NEXT: i32.const $push34=, -1
+; NO-SIMD128-NEXT: i32.xor $push25=, $2, $pop34
+; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-NEXT: i32.xor $push28=, $pop27, $18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT: i32.xor $push30=, $9, $17
+; NO-SIMD128-NEXT: i32.const $push33=, -1
+; NO-SIMD128-NEXT: i32.xor $push29=, $1, $pop33
; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $19
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop32
-; NO-SIMD128-NEXT: i32.xor $push34=, $10, $18
-; NO-SIMD128-NEXT: i32.const $push42=, -1
-; NO-SIMD128-NEXT: i32.xor $push33=, $2, $pop42
-; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $pop33
-; NO-SIMD128-NEXT: i32.xor $push36=, $pop35, $18
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop36
-; NO-SIMD128-NEXT: i32.xor $push38=, $9, $17
-; NO-SIMD128-NEXT: i32.const $push41=, -1
-; NO-SIMD128-NEXT: i32.xor $push37=, $1, $pop41
-; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $pop37
-; NO-SIMD128-NEXT: i32.xor $push40=, $pop39, $17
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop40
+; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $17
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop32
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v8i16:
@@ -9382,55 +7830,47 @@ define <8 x i16> @bitselect_xor_reversed_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $17
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $10, $18
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop39
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $18
; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $11, $19
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop46
+; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop38
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $19
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $12, $20
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop45
+; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop37
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $20
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop16
-; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $13, $21
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push19=, $5, $pop44
-; NO-SIMD128-FAST-NEXT: i32.and $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $pop21, $21
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop22
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $14, $22
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $6, $pop43
-; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $22
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push33=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push34=, $0, $pop33
-; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $15, $23
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $7, $pop42
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $13, $21
+; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $5, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $pop17
+; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $21
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $14, $22
+; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $6, $pop35
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $pop23, $22
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $15, $23
+; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push25=, $7, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $pop25
+; NO-SIMD128-FAST-NEXT: i32.xor $push28=, $pop27, $23
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $16, $24
+; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $8, $pop33
; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop34), $pop32
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push40=, $0, $pop39
-; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $16, $24
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $8, $pop41
-; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $pop35
-; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop40), $pop38
+; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $24
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <8 x i16> %v1, %v2
%notc = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1,
@@ -9458,46 +7898,38 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v8i16:
; NO-SIMD128: .functype extmul_low_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $5
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $21
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $24
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $3
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $19
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $23
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $2
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $18
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $22
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $1
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $17
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $21
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $8
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $24
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $4
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $20
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $7
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $23
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $6
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $22
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $4
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $20
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $3
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $19
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $2
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $18
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $1
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $17
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v8i16:
@@ -9515,34 +7947,26 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $19
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $20
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $5
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $21
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $6
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $22
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $7
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $23
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $24
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $20
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $5
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $21
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $6
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $22
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $7
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $23
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $24
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9572,46 +7996,38 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v8i16:
; NO-SIMD128: .functype extmul_high_s_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend8_s $push1=, $13
-; NO-SIMD128-NEXT: i32.extend8_s $push0=, $29
+; NO-SIMD128-NEXT: i32.extend8_s $push1=, $16
+; NO-SIMD128-NEXT: i32.extend8_s $push0=, $32
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend8_s $push4=, $11
-; NO-SIMD128-NEXT: i32.extend8_s $push3=, $27
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT: i32.extend8_s $push4=, $15
+; NO-SIMD128-NEXT: i32.extend8_s $push3=, $31
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend8_s $push7=, $10
-; NO-SIMD128-NEXT: i32.extend8_s $push6=, $26
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT: i32.extend8_s $push7=, $14
+; NO-SIMD128-NEXT: i32.extend8_s $push6=, $30
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop8
-; NO-SIMD128-NEXT: i32.extend8_s $push10=, $9
-; NO-SIMD128-NEXT: i32.extend8_s $push9=, $25
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT: i32.extend8_s $push10=, $13
+; NO-SIMD128-NEXT: i32.extend8_s $push9=, $29
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop11
-; NO-SIMD128-NEXT: i32.const $push15=, 14
-; NO-SIMD128-NEXT: i32.add $push16=, $0, $pop15
-; NO-SIMD128-NEXT: i32.extend8_s $push13=, $16
-; NO-SIMD128-NEXT: i32.extend8_s $push12=, $32
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT: i32.extend8_s $push13=, $12
+; NO-SIMD128-NEXT: i32.extend8_s $push12=, $28
; NO-SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; NO-SIMD128-NEXT: i32.store16 0($pop16), $pop14
-; NO-SIMD128-NEXT: i32.const $push20=, 12
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.extend8_s $push18=, $15
-; NO-SIMD128-NEXT: i32.extend8_s $push17=, $31
-; NO-SIMD128-NEXT: i32.mul $push19=, $pop18, $pop17
-; NO-SIMD128-NEXT: i32.store16 0($pop21), $pop19
-; NO-SIMD128-NEXT: i32.const $push25=, 10
-; NO-SIMD128-NEXT: i32.add $push26=, $0, $pop25
-; NO-SIMD128-NEXT: i32.extend8_s $push23=, $14
-; NO-SIMD128-NEXT: i32.extend8_s $push22=, $30
-; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; NO-SIMD128-NEXT: i32.store16 0($pop26), $pop24
-; NO-SIMD128-NEXT: i32.const $push30=, 6
-; NO-SIMD128-NEXT: i32.add $push31=, $0, $pop30
-; NO-SIMD128-NEXT: i32.extend8_s $push28=, $12
-; NO-SIMD128-NEXT: i32.extend8_s $push27=, $28
-; NO-SIMD128-NEXT: i32.mul $push29=, $pop28, $pop27
-; NO-SIMD128-NEXT: i32.store16 0($pop31), $pop29
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT: i32.extend8_s $push16=, $11
+; NO-SIMD128-NEXT: i32.extend8_s $push15=, $27
+; NO-SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT: i32.extend8_s $push19=, $10
+; NO-SIMD128-NEXT: i32.extend8_s $push18=, $26
+; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT: i32.extend8_s $push22=, $9
+; NO-SIMD128-NEXT: i32.extend8_s $push21=, $25
+; NO-SIMD128-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop23
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v8i16:
@@ -9629,34 +8045,26 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend8_s $push6=, $27
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $12
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push11=, $28
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop10), $pop13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $13
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push14=, $29
-; NO-SIMD128-FAST-NEXT: i32.mul $push16=, $pop15, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop16
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push20=, $14
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $30
-; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop18), $pop21
-; NO-SIMD128-FAST-NEXT: i32.const $push22=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push23=, $0, $pop22
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push25=, $15
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push24=, $31
-; NO-SIMD128-FAST-NEXT: i32.mul $push26=, $pop25, $pop24
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop23), $pop26
-; NO-SIMD128-FAST-NEXT: i32.const $push27=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push28=, $0, $pop27
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push30=, $16
-; NO-SIMD128-FAST-NEXT: i32.extend8_s $push29=, $32
-; NO-SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop28), $pop31
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push10=, $12
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push9=, $28
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push13=, $13
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push12=, $29
+; NO-SIMD128-FAST-NEXT: i32.mul $push14=, $pop13, $pop12
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push16=, $14
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push15=, $30
+; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push19=, $15
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push18=, $31
+; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push22=, $16
+; NO-SIMD128-FAST-NEXT: i32.extend8_s $push21=, $32
+; NO-SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9687,61 +8095,53 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_low_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $21, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $19, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $18, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $17, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $8, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $24, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $24, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $7, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $23, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $23, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $6, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $22, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $22, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $4, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $20, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $21, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $4, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $20, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $3, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $19, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $2, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $18, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $1, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $17, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v8i16:
@@ -9749,60 +8149,52 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $17, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $5, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $21, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $6, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $22, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $18, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $7, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $23, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $19, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $24, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $20, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $5, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $21, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $6, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $22, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $7, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $23, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $8, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $24, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9833,61 +8225,53 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128: .functype extmul_high_u_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 255
-; NO-SIMD128-NEXT: i32.and $push2=, $13, $pop0
-; NO-SIMD128-NEXT: i32.const $push47=, 255
-; NO-SIMD128-NEXT: i32.and $push1=, $29, $pop47
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store16 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push46=, 255
-; NO-SIMD128-NEXT: i32.and $push5=, $11, $pop46
-; NO-SIMD128-NEXT: i32.const $push45=, 255
-; NO-SIMD128-NEXT: i32.and $push4=, $27, $pop45
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store16 4($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push44=, 255
-; NO-SIMD128-NEXT: i32.and $push8=, $10, $pop44
-; NO-SIMD128-NEXT: i32.const $push43=, 255
-; NO-SIMD128-NEXT: i32.and $push7=, $26, $pop43
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store16 2($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push42=, 255
-; NO-SIMD128-NEXT: i32.and $push11=, $9, $pop42
-; NO-SIMD128-NEXT: i32.const $push41=, 255
-; NO-SIMD128-NEXT: i32.and $push10=, $25, $pop41
-; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store16 0($0), $pop12
-; NO-SIMD128-NEXT: i32.const $push16=, 14
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.const $push40=, 255
-; NO-SIMD128-NEXT: i32.and $push14=, $16, $pop40
+; NO-SIMD128-NEXT: i32.and $push2=, $16, $pop0
; NO-SIMD128-NEXT: i32.const $push39=, 255
-; NO-SIMD128-NEXT: i32.and $push13=, $32, $pop39
-; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
-; NO-SIMD128-NEXT: i32.store16 0($pop17), $pop15
-; NO-SIMD128-NEXT: i32.const $push21=, 12
-; NO-SIMD128-NEXT: i32.add $push22=, $0, $pop21
+; NO-SIMD128-NEXT: i32.and $push1=, $32, $pop39
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store16 14($0), $pop3
; NO-SIMD128-NEXT: i32.const $push38=, 255
-; NO-SIMD128-NEXT: i32.and $push19=, $15, $pop38
+; NO-SIMD128-NEXT: i32.and $push5=, $15, $pop38
; NO-SIMD128-NEXT: i32.const $push37=, 255
-; NO-SIMD128-NEXT: i32.and $push18=, $31, $pop37
-; NO-SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-NEXT: i32.const $push26=, 10
-; NO-SIMD128-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-NEXT: i32.and $push4=, $31, $pop37
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop6
; NO-SIMD128-NEXT: i32.const $push36=, 255
-; NO-SIMD128-NEXT: i32.and $push24=, $14, $pop36
+; NO-SIMD128-NEXT: i32.and $push8=, $14, $pop36
; NO-SIMD128-NEXT: i32.const $push35=, 255
-; NO-SIMD128-NEXT: i32.and $push23=, $30, $pop35
-; NO-SIMD128-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-NEXT: i32.const $push31=, 6
-; NO-SIMD128-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-NEXT: i32.and $push7=, $30, $pop35
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store16 10($0), $pop9
; NO-SIMD128-NEXT: i32.const $push34=, 255
-; NO-SIMD128-NEXT: i32.and $push29=, $12, $pop34
+; NO-SIMD128-NEXT: i32.and $push11=, $13, $pop34
; NO-SIMD128-NEXT: i32.const $push33=, 255
-; NO-SIMD128-NEXT: i32.and $push28=, $28, $pop33
-; NO-SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-NEXT: i32.and $push10=, $29, $pop33
+; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-NEXT: i32.store16 8($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push32=, 255
+; NO-SIMD128-NEXT: i32.and $push14=, $12, $pop32
+; NO-SIMD128-NEXT: i32.const $push31=, 255
+; NO-SIMD128-NEXT: i32.and $push13=, $28, $pop31
+; NO-SIMD128-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.store16 6($0), $pop15
+; NO-SIMD128-NEXT: i32.const $push30=, 255
+; NO-SIMD128-NEXT: i32.and $push17=, $11, $pop30
+; NO-SIMD128-NEXT: i32.const $push29=, 255
+; NO-SIMD128-NEXT: i32.and $push16=, $27, $pop29
+; NO-SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-NEXT: i32.store16 4($0), $pop18
+; NO-SIMD128-NEXT: i32.const $push28=, 255
+; NO-SIMD128-NEXT: i32.and $push20=, $10, $pop28
+; NO-SIMD128-NEXT: i32.const $push27=, 255
+; NO-SIMD128-NEXT: i32.and $push19=, $26, $pop27
+; NO-SIMD128-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-NEXT: i32.store16 2($0), $pop21
+; NO-SIMD128-NEXT: i32.const $push26=, 255
+; NO-SIMD128-NEXT: i32.and $push23=, $9, $pop26
+; NO-SIMD128-NEXT: i32.const $push25=, 255
+; NO-SIMD128-NEXT: i32.and $push22=, $25, $pop25
+; NO-SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-NEXT: i32.store16 0($0), $pop24
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v8i16:
@@ -9895,60 +8279,52 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %v1, <16 x i8> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 255
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $9, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push47=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop47
+; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $25, $pop39
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push46=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop46
-; NO-SIMD128-FAST-NEXT: i32.const $push45=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop45
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push44=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop44
-; NO-SIMD128-FAST-NEXT: i32.const $push43=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop43
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 6
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
-; NO-SIMD128-FAST-NEXT: i32.const $push42=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop42
-; NO-SIMD128-FAST-NEXT: i32.const $push41=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop41
-; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop14), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push40=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push16=, $13, $pop40
-; NO-SIMD128-FAST-NEXT: i32.const $push39=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push15=, $29, $pop39
-; NO-SIMD128-FAST-NEXT: i32.mul $push17=, $pop16, $pop15
-; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 10
-; NO-SIMD128-FAST-NEXT: i32.add $push22=, $0, $pop21
; NO-SIMD128-FAST-NEXT: i32.const $push38=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push19=, $14, $pop38
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $pop38
; NO-SIMD128-FAST-NEXT: i32.const $push37=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push18=, $30, $pop37
-; NO-SIMD128-FAST-NEXT: i32.mul $push20=, $pop19, $pop18
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop22), $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push26=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push27=, $0, $pop26
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $26, $pop37
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push36=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push24=, $15, $pop36
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $11, $pop36
; NO-SIMD128-FAST-NEXT: i32.const $push35=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push23=, $31, $pop35
-; NO-SIMD128-FAST-NEXT: i32.mul $push25=, $pop24, $pop23
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop27), $pop25
-; NO-SIMD128-FAST-NEXT: i32.const $push31=, 14
-; NO-SIMD128-FAST-NEXT: i32.add $push32=, $0, $pop31
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $27, $pop35
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop9
; NO-SIMD128-FAST-NEXT: i32.const $push34=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $pop34
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $12, $pop34
; NO-SIMD128-FAST-NEXT: i32.const $push33=, 255
-; NO-SIMD128-FAST-NEXT: i32.and $push28=, $32, $pop33
-; NO-SIMD128-FAST-NEXT: i32.mul $push30=, $pop29, $pop28
-; NO-SIMD128-FAST-NEXT: i32.store16 0($pop32), $pop30
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $28, $pop33
+; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
+; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push32=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push14=, $13, $pop32
+; NO-SIMD128-FAST-NEXT: i32.const $push31=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push13=, $29, $pop31
+; NO-SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
+; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop15
+; NO-SIMD128-FAST-NEXT: i32.const $push30=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push17=, $14, $pop30
+; NO-SIMD128-FAST-NEXT: i32.const $push29=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push16=, $30, $pop29
+; NO-SIMD128-FAST-NEXT: i32.mul $push18=, $pop17, $pop16
+; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop18
+; NO-SIMD128-FAST-NEXT: i32.const $push28=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push20=, $15, $pop28
+; NO-SIMD128-FAST-NEXT: i32.const $push27=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push19=, $31, $pop27
+; NO-SIMD128-FAST-NEXT: i32.mul $push21=, $pop20, $pop19
+; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push26=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push23=, $16, $pop26
+; NO-SIMD128-FAST-NEXT: i32.const $push25=, 255
+; NO-SIMD128-FAST-NEXT: i32.and $push22=, $32, $pop25
+; NO-SIMD128-FAST-NEXT: i32.mul $push24=, $pop23, $pop22
+; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop24
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <16 x i8> %v1, <16 x i8> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -9979,16 +8355,14 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: add_v4i32:
; NO-SIMD128: .functype add_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4i32:
@@ -10000,10 +8374,8 @@ define <4 x i32> @add_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = add <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10025,16 +8397,14 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: sub_v4i32:
; NO-SIMD128: .functype sub_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4i32:
@@ -10046,10 +8416,8 @@ define <4 x i32> @sub_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10071,16 +8439,14 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: mul_v4i32:
; NO-SIMD128: .functype mul_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4i32:
@@ -10092,10 +8458,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = mul <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10117,20 +8481,18 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_s_v4i32:
; NO-SIMD128: .functype min_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_s_v4i32:
@@ -10145,11 +8507,9 @@ define <4 x i32> @min_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp slt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10172,20 +8532,18 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: min_u_v4i32:
; NO-SIMD128: .functype min_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.lt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.lt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.lt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.lt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.lt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.lt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.lt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.lt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_u_v4i32:
@@ -10200,11 +8558,9 @@ define <4 x i32> @min_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.lt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.lt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ult <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10227,20 +8583,18 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_s_v4i32:
; NO-SIMD128: .functype max_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_s $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_s $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_s $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_s $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_s $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_s $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_s_v4i32:
@@ -10255,11 +8609,9 @@ define <4 x i32> @max_s_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_s $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_s $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp sgt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10282,20 +8634,18 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: max_u_v4i32:
; NO-SIMD128: .functype max_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.gt_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.select $push1=, $3, $7, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.gt_u $push2=, $2, $6
-; NO-SIMD128-NEXT: i32.select $push3=, $2, $6, $pop2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop3
-; NO-SIMD128-NEXT: i32.gt_u $push4=, $1, $5
-; NO-SIMD128-NEXT: i32.select $push5=, $1, $5, $pop4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.gt_u $push6=, $4, $8
-; NO-SIMD128-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: i32.gt_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.select $push1=, $4, $8, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
+; NO-SIMD128-NEXT: i32.gt_u $push2=, $3, $7
+; NO-SIMD128-NEXT: i32.select $push3=, $3, $7, $pop2
+; NO-SIMD128-NEXT: i32.store 8($0), $pop3
+; NO-SIMD128-NEXT: i32.gt_u $push4=, $2, $6
+; NO-SIMD128-NEXT: i32.select $push5=, $2, $6, $pop4
+; NO-SIMD128-NEXT: i32.store 4($0), $pop5
+; NO-SIMD128-NEXT: i32.gt_u $push6=, $1, $5
+; NO-SIMD128-NEXT: i32.select $push7=, $1, $5, $pop6
+; NO-SIMD128-NEXT: i32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_u_v4i32:
@@ -10310,11 +8660,9 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.gt_u $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.select $push5=, $3, $7, $pop4
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: i32.gt_u $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.select $push7=, $4, $8, $pop6
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = icmp ugt <4 x i32> %x, %y
%a = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
@@ -10337,63 +8685,59 @@ define <4 x i32> @abs_v4i32(<4 x i32> %x) {
; NO-SIMD128-LABEL: abs_v4i32:
; NO-SIMD128: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.const $push0=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push21=, $4, $pop0
-; NO-SIMD128-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop20
+; NO-SIMD128-NEXT: i32.shr_s $push19=, $4, $pop0
+; NO-SIMD128-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop18
; NO-SIMD128-NEXT: i32.sub $push2=, $pop1, $5
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.const $push19=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push18=, $3, $pop19
-; NO-SIMD128-NEXT: local.tee $push17=, $4=, $pop18
-; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push17=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push16=, $3, $pop17
+; NO-SIMD128-NEXT: local.tee $push15=, $4=, $pop16
+; NO-SIMD128-NEXT: i32.xor $push3=, $3, $pop15
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop3, $4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push14=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push13=, $2, $pop14
+; NO-SIMD128-NEXT: local.tee $push12=, $4=, $pop13
+; NO-SIMD128-NEXT: i32.xor $push5=, $2, $pop12
; NO-SIMD128-NEXT: i32.sub $push6=, $pop5, $4
-; NO-SIMD128-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push16=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push15=, $2, $pop16
-; NO-SIMD128-NEXT: local.tee $push14=, $4=, $pop15
-; NO-SIMD128-NEXT: i32.xor $push7=, $2, $pop14
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push11=, 31
+; NO-SIMD128-NEXT: i32.shr_s $push10=, $1, $pop11
+; NO-SIMD128-NEXT: local.tee $push9=, $4=, $pop10
+; NO-SIMD128-NEXT: i32.xor $push7=, $1, $pop9
; NO-SIMD128-NEXT: i32.sub $push8=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push13=, 31
-; NO-SIMD128-NEXT: i32.shr_s $push12=, $1, $pop13
-; NO-SIMD128-NEXT: local.tee $push11=, $4=, $pop12
-; NO-SIMD128-NEXT: i32.xor $push9=, $1, $pop11
-; NO-SIMD128-NEXT: i32.sub $push10=, $pop9, $4
-; NO-SIMD128-NEXT: i32.store 0($0), $pop10
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4i32:
; NO-SIMD128-FAST: .functype abs_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push21=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: local.tee $push20=, $5=, $pop21
-; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop20
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push19=, $1, $pop0
+; NO-SIMD128-FAST-NEXT: local.tee $push18=, $5=, $pop19
+; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop18
; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop1, $5
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push18=, $2, $pop19
-; NO-SIMD128-FAST-NEXT: local.tee $push17=, $1=, $pop18
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push16=, $2, $pop17
+; NO-SIMD128-FAST-NEXT: local.tee $push15=, $1=, $pop16
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $2, $pop15
; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop3, $1
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push15=, $3, $pop16
-; NO-SIMD128-FAST-NEXT: local.tee $push14=, $2=, $pop15
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push13=, $3, $pop14
+; NO-SIMD128-FAST-NEXT: local.tee $push12=, $2=, $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $3, $pop12
; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop5, $2
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 31
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push12=, $4, $pop13
-; NO-SIMD128-FAST-NEXT: local.tee $push11=, $0=, $pop12
-; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $0
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, 31
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push10=, $4, $pop11
+; NO-SIMD128-FAST-NEXT: local.tee $push9=, $3=, $pop10
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: i32.sub $push8=, $pop7, $3
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> zeroinitializer, %x
%b = icmp slt <4 x i32> %x, zeroinitializer
@@ -10418,19 +8762,17 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype neg_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 0
-; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $3
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 0
-; NO-SIMD128-NEXT: i32.sub $push2=, $pop9, $2
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 0
-; NO-SIMD128-NEXT: i32.sub $push3=, $pop8, $1
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.sub $push1=, $pop0, $4
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 0
-; NO-SIMD128-NEXT: i32.sub $push4=, $pop7, $4
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.sub $push2=, $pop7, $3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 0
+; NO-SIMD128-NEXT: i32.sub $push3=, $pop6, $2
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 0
+; NO-SIMD128-NEXT: i32.sub $push4=, $pop5, $1
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4i32:
@@ -10439,17 +8781,15 @@ define <4 x i32> @neg_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 0
; NO-SIMD128-FAST-NEXT: i32.sub $push1=, $pop0, $1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop9, $2
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push2=, $pop7, $2
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop8, $3
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push3=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 0
-; NO-SIMD128-FAST-NEXT: i32.sub $push6=, $pop7, $4
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 0
+; NO-SIMD128-FAST-NEXT: i32.sub $push4=, $pop5, $4
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = sub <4 x i32> <i32 0, i32 0, i32 0, i32 0>, %x
ret <4 x i32> %a
@@ -10471,16 +8811,14 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shl_v4i32:
; NO-SIMD128: .functype shl_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_v4i32:
@@ -10492,10 +8830,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10523,19 +8859,17 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128: .functype shl_const_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 5
-; NO-SIMD128-NEXT: i32.shl $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, 5
-; NO-SIMD128-NEXT: i32.shl $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, 5
-; NO-SIMD128-NEXT: i32.shl $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.shl $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, 5
-; NO-SIMD128-NEXT: i32.shl $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.shl $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, 5
+; NO-SIMD128-NEXT: i32.shl $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, 5
+; NO-SIMD128-NEXT: i32.shl $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_const_v4i32:
@@ -10544,17 +8878,15 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 5
; NO-SIMD128-FAST-NEXT: i32.shl $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 5
-; NO-SIMD128-FAST-NEXT: i32.shl $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, 5
+; NO-SIMD128-FAST-NEXT: i32.shl $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %a
@@ -10606,16 +8938,14 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shl_vec_v4i32:
; NO-SIMD128: .functype shl_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shl $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shl $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shl $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shl $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shl $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shl $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shl $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shl $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shl_vec_v4i32:
@@ -10627,10 +8957,8 @@ define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shl $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shl $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shl $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = shl <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10652,16 +8980,14 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_s_v4i32:
; NO-SIMD128: .functype shr_s_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_v4i32:
@@ -10673,10 +8999,8 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10731,16 +9055,14 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_s_vec_v4i32:
; NO-SIMD128: .functype shr_s_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_s $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_s $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_s $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_s $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_s $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_s $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_s $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_s $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_s_vec_v4i32:
@@ -10752,10 +9074,8 @@ define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_s $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_s $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_s $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = ashr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10777,16 +9097,14 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: shr_u_v4i32:
; NO-SIMD128: .functype shr_u_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $5
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $5
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $5
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $5
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_v4i32:
@@ -10798,10 +9116,8 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $5
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%t = insertelement <4 x i32> undef, i32 %x, i32 0
%s = shufflevector <4 x i32> %t, <4 x i32> undef,
@@ -10856,16 +9172,14 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-LABEL: shr_u_vec_v4i32:
; NO-SIMD128: .functype shr_u_vec_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.shr_u $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.shr_u $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.shr_u $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.shr_u $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.shr_u $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.shr_u $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.shr_u $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.shr_u $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: shr_u_vec_v4i32:
@@ -10877,10 +9191,8 @@ define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.shr_u $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.shr_u $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.shr_u $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = lshr <4 x i32> %v, %x
ret <4 x i32> %a
@@ -10902,16 +9214,14 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: and_v4i32:
; NO-SIMD128: .functype and_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.and $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.and $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.and $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.and $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.and $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.and $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.and $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: and_v4i32:
@@ -10923,10 +9233,8 @@ define <4 x i32> @and_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.and $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = and <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10948,16 +9256,14 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: or_v4i32:
; NO-SIMD128: .functype or_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.or $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.or $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.or $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.or $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.or $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.or $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.or $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.or $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: or_v4i32:
@@ -10969,10 +9275,8 @@ define <4 x i32> @or_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.or $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.or $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.or $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = or <4 x i32> %x, %y
ret <4 x i32> %a
@@ -10994,16 +9298,14 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: xor_v4i32:
; NO-SIMD128: .functype xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.xor $push0=, $3, $7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop0
-; NO-SIMD128-NEXT: i32.xor $push1=, $2, $6
-; NO-SIMD128-NEXT: i32.store 4($0), $pop1
-; NO-SIMD128-NEXT: i32.xor $push2=, $1, $5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.xor $push3=, $4, $8
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: i32.xor $push0=, $4, $8
+; NO-SIMD128-NEXT: i32.store 12($0), $pop0
+; NO-SIMD128-NEXT: i32.xor $push1=, $3, $7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop1
+; NO-SIMD128-NEXT: i32.xor $push2=, $2, $6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $1, $5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: xor_v4i32:
@@ -11015,10 +9317,8 @@ define <4 x i32> @xor_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, %y
ret <4 x i32> %a
@@ -11041,19 +9341,17 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128: .functype not_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $3, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop1
-; NO-SIMD128-NEXT: i32.const $push9=, -1
-; NO-SIMD128-NEXT: i32.xor $push2=, $2, $pop9
-; NO-SIMD128-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push8=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $1, $pop8
-; NO-SIMD128-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
+; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
+; NO-SIMD128-NEXT: i32.store 12($0), $pop1
; NO-SIMD128-NEXT: i32.const $push7=, -1
-; NO-SIMD128-NEXT: i32.xor $push4=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
+; NO-SIMD128-NEXT: i32.xor $push2=, $3, $pop7
+; NO-SIMD128-NEXT: i32.store 8($0), $pop2
+; NO-SIMD128-NEXT: i32.const $push6=, -1
+; NO-SIMD128-NEXT: i32.xor $push3=, $2, $pop6
+; NO-SIMD128-NEXT: i32.store 4($0), $pop3
+; NO-SIMD128-NEXT: i32.const $push5=, -1
+; NO-SIMD128-NEXT: i32.xor $push4=, $1, $pop5
+; NO-SIMD128-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: not_v4i32:
@@ -11062,17 +9360,15 @@ define <4 x i32> @not_v4i32(<4 x i32> %x) {
; NO-SIMD128-FAST-NEXT: i32.const $push0=, -1
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $1, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $2, $pop7
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop8
+; NO-SIMD128-FAST-NEXT: i32.const $push6=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $3, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $4, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: i32.const $push5=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $4, $pop5
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
ret <4 x i32> %a
@@ -11096,23 +9392,21 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128: .functype andnot_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, -1
-; NO-SIMD128-NEXT: i32.xor $push1=, $7, $pop0
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push13=, -1
-; NO-SIMD128-NEXT: i32.xor $push3=, $6, $pop13
-; NO-SIMD128-NEXT: i32.and $push4=, $2, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push12=, -1
-; NO-SIMD128-NEXT: i32.xor $push5=, $5, $pop12
-; NO-SIMD128-NEXT: i32.and $push6=, $1, $pop5
-; NO-SIMD128-NEXT: i32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: i32.xor $push1=, $8, $pop0
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
; NO-SIMD128-NEXT: i32.const $push11=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $8, $pop11
-; NO-SIMD128-NEXT: i32.and $push8=, $4, $pop7
-; NO-SIMD128-NEXT: i32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $pop11
+; NO-SIMD128-NEXT: i32.and $push4=, $3, $pop3
+; NO-SIMD128-NEXT: i32.store 8($0), $pop4
+; NO-SIMD128-NEXT: i32.const $push10=, -1
+; NO-SIMD128-NEXT: i32.xor $push5=, $6, $pop10
+; NO-SIMD128-NEXT: i32.and $push6=, $2, $pop5
+; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.const $push9=, -1
+; NO-SIMD128-NEXT: i32.xor $push7=, $5, $pop9
+; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop7
+; NO-SIMD128-NEXT: i32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: andnot_v4i32:
@@ -11122,20 +9416,18 @@ define <4 x i32> @andnot_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: i32.xor $push1=, $5, $pop0
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop13
+; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $pop11
; NO-SIMD128-FAST-NEXT: i32.and $push4=, $2, $pop3
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push12=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop12
+; NO-SIMD128-FAST-NEXT: i32.const $push10=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $7, $pop10
; NO-SIMD128-FAST-NEXT: i32.and $push6=, $3, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push7=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push8=, $0, $pop7
-; NO-SIMD128-FAST-NEXT: i32.const $push11=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $pop11
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $4, $pop9
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop8), $pop10
+; NO-SIMD128-FAST-NEXT: i32.const $push9=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push7=, $8, $pop9
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $4, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%inv_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %x, %inv_y
@@ -11161,32 +9453,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-LABEL: bitselect_v4i32:
; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.const $push1=, -1
; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12
; NO-SIMD128-NEXT: i32.and $push0=, $4, $8
; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push8=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $11
-; NO-SIMD128-NEXT: i32.and $push7=, $3, $7
-; NO-SIMD128-NEXT: i32.or $push10=, $pop9, $pop7
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push12=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $10
-; NO-SIMD128-NEXT: i32.and $push11=, $2, $6
-; NO-SIMD128-NEXT: i32.or $push14=, $pop13, $pop11
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push16=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $9
-; NO-SIMD128-NEXT: i32.and $push15=, $1, $5
-; NO-SIMD128-NEXT: i32.or $push18=, $pop17, $pop15
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $7
+; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10
+; NO-SIMD128-NEXT: i32.and $push9=, $2, $6
+; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9
+; NO-SIMD128-NEXT: i32.and $push13=, $1, $5
+; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
@@ -11198,26 +9488,24 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5
; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10
; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6
; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11
; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7
; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12
; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8
; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%masked_v1 = and <4 x i32> %c, %v1
%inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
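For reference, bitselect_v4i32 computes, per 32-bit lane, (c & v1) | (~c & v2). Without SIMD that scalarizes to the and/xor/and/or sequence above; with -mattr=+simd128 the same IR should collapse to the single spec instruction (shown here only as a sketch, since the SIMD check lines fall outside this hunk):

    ;; v128.bitselect pops v1, v2, c and pushes (v1 & c) | (v2 & ~c)
    v128.bitselect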
@@ -11244,24 +9532,22 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-LABEL: bitselect_xor_v4i32:
; NO-SIMD128: .functype bitselect_xor_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12
; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4
; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12
-; NO-SIMD128-NEXT: i32.store 0($pop4), $pop2
-; NO-SIMD128-NEXT: i32.xor $push5=, $7, $11
-; NO-SIMD128-NEXT: i32.and $push6=, $pop5, $3
-; NO-SIMD128-NEXT: i32.xor $push7=, $pop6, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop7
-; NO-SIMD128-NEXT: i32.xor $push8=, $6, $10
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $2
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push11=, $5, $9
-; NO-SIMD128-NEXT: i32.and $push12=, $pop11, $1
-; NO-SIMD128-NEXT: i32.xor $push13=, $pop12, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop13
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_v4i32:
@@ -11279,12 +9565,10 @@ define <4 x i32> @bitselect_xor_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.and $push12=, $pop11, $4
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $pop12, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%and = and <4 x i32> %xor1, %c
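The xor variant relies on the standard two-xor form of bitselect; the identity, written out as comments:

    ;; ((v1 ^ v2) & c) ^ v2
    ;;   = (v1 & c) ^ (v2 & c) ^ v2    ;; distribute & over ^
    ;;   = (v1 & c) ^ (v2 & ~c)        ;; (v2 & c) ^ v2 = v2 & ~c
    ;;   = (v1 & c) | (v2 & ~c)        ;; the two masks are disjoint

so the scalar expansion needs only xor/and/xor per lane, with the same offset folding for lane 3.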
@@ -11311,32 +9595,30 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-LABEL: bitselect_xor_reversed_v4i32:
; NO-SIMD128: .functype bitselect_xor_reversed_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
; NO-SIMD128-NEXT: i32.xor $push2=, $8, $12
; NO-SIMD128-NEXT: i32.const $push0=, -1
; NO-SIMD128-NEXT: i32.xor $push1=, $4, $pop0
; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $pop1
; NO-SIMD128-NEXT: i32.xor $push4=, $pop3, $12
-; NO-SIMD128-NEXT: i32.store 0($pop6), $pop4
-; NO-SIMD128-NEXT: i32.xor $push8=, $7, $11
-; NO-SIMD128-NEXT: i32.const $push21=, -1
-; NO-SIMD128-NEXT: i32.xor $push7=, $3, $pop21
-; NO-SIMD128-NEXT: i32.and $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.xor $push10=, $pop9, $11
-; NO-SIMD128-NEXT: i32.store 8($0), $pop10
-; NO-SIMD128-NEXT: i32.xor $push12=, $6, $10
-; NO-SIMD128-NEXT: i32.const $push20=, -1
-; NO-SIMD128-NEXT: i32.xor $push11=, $2, $pop20
-; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $pop11
-; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $10
-; NO-SIMD128-NEXT: i32.store 4($0), $pop14
-; NO-SIMD128-NEXT: i32.xor $push16=, $5, $9
+; NO-SIMD128-NEXT: i32.store 12($0), $pop4
+; NO-SIMD128-NEXT: i32.xor $push6=, $7, $11
; NO-SIMD128-NEXT: i32.const $push19=, -1
-; NO-SIMD128-NEXT: i32.xor $push15=, $1, $pop19
-; NO-SIMD128-NEXT: i32.and $push17=, $pop16, $pop15
-; NO-SIMD128-NEXT: i32.xor $push18=, $pop17, $9
-; NO-SIMD128-NEXT: i32.store 0($0), $pop18
+; NO-SIMD128-NEXT: i32.xor $push5=, $3, $pop19
+; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $pop5
+; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $11
+; NO-SIMD128-NEXT: i32.store 8($0), $pop8
+; NO-SIMD128-NEXT: i32.xor $push10=, $6, $10
+; NO-SIMD128-NEXT: i32.const $push18=, -1
+; NO-SIMD128-NEXT: i32.xor $push9=, $2, $pop18
+; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $pop9
+; NO-SIMD128-NEXT: i32.xor $push12=, $pop11, $10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop12
+; NO-SIMD128-NEXT: i32.xor $push14=, $5, $9
+; NO-SIMD128-NEXT: i32.const $push17=, -1
+; NO-SIMD128-NEXT: i32.xor $push13=, $1, $pop17
+; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $pop13
+; NO-SIMD128-NEXT: i32.xor $push16=, $pop15, $9
+; NO-SIMD128-NEXT: i32.store 0($0), $pop16
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: bitselect_xor_reversed_v4i32:
@@ -11349,25 +9631,23 @@ define <4 x i32> @bitselect_xor_reversed_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x
; NO-SIMD128-FAST-NEXT: i32.xor $push4=, $pop3, $9
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4
; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $6, $10
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $2, $pop19
; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $pop5
; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $10
; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8
; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $7, $11
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop20
+; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $3, $pop18
; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $pop9
; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $pop11, $11
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT: i32.const $push17=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push18=, $0, $pop17
; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $8, $12
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop19
+; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT: i32.xor $push13=, $4, $pop17
; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $pop13
; NO-SIMD128-FAST-NEXT: i32.xor $push16=, $pop15, $12
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop18), $pop16
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16
; NO-SIMD128-FAST-NEXT: return
%xor1 = xor <4 x i32> %v1, %v2
%notc = xor <4 x i32> %c, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -11394,24 +9674,22 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_low_s_v4i32:
; NO-SIMD128: .functype extmul_low_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $3
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $11
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $4
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $12
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $2
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $10
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $3
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $11
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $1
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $9
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $2
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $10
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $4
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $1
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $9
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_s_v4i32:
@@ -11429,12 +9707,10 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $11
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $4
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $12
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $4
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $12
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
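extmul_low_s takes the low four i16 lanes of each input, sign-extends them to i32, and multiplies. Scalarized, each lane is the pair shown in the hunk above:

    ;; per lane i: result[i] = sext16(v1[i]) * sext16(v2[i])
    i32.extend16_s $push1=, $4     ;; sign-extend the low 16 bits in place
    i32.extend16_s $push0=, $12
    i32.mul        $push2=, $pop1, $pop0

The high and unsigned variants below differ only in which lanes they read ($5..$8 and $13..$16 for high) and in masking with 65535 instead of sign-extending (for unsigned).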
@@ -11464,24 +9740,22 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-LABEL: extmul_high_s_v4i32:
; NO-SIMD128: .functype extmul_high_s_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.extend16_s $push1=, $7
-; NO-SIMD128-NEXT: i32.extend16_s $push0=, $15
+; NO-SIMD128-NEXT: i32.extend16_s $push1=, $8
+; NO-SIMD128-NEXT: i32.extend16_s $push0=, $16
; NO-SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; NO-SIMD128-NEXT: i32.store 8($0), $pop2
-; NO-SIMD128-NEXT: i32.extend16_s $push4=, $6
-; NO-SIMD128-NEXT: i32.extend16_s $push3=, $14
+; NO-SIMD128-NEXT: i32.store 12($0), $pop2
+; NO-SIMD128-NEXT: i32.extend16_s $push4=, $7
+; NO-SIMD128-NEXT: i32.extend16_s $push3=, $15
; NO-SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; NO-SIMD128-NEXT: i32.store 4($0), $pop5
-; NO-SIMD128-NEXT: i32.extend16_s $push7=, $5
-; NO-SIMD128-NEXT: i32.extend16_s $push6=, $13
+; NO-SIMD128-NEXT: i32.store 8($0), $pop5
+; NO-SIMD128-NEXT: i32.extend16_s $push7=, $6
+; NO-SIMD128-NEXT: i32.extend16_s $push6=, $14
; NO-SIMD128-NEXT: i32.mul $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.store 0($0), $pop8
-; NO-SIMD128-NEXT: i32.const $push12=, 12
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.extend16_s $push10=, $8
-; NO-SIMD128-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-NEXT: i32.store 4($0), $pop8
+; NO-SIMD128-NEXT: i32.extend16_s $push10=, $5
+; NO-SIMD128-NEXT: i32.extend16_s $push9=, $13
; NO-SIMD128-NEXT: i32.mul $push11=, $pop10, $pop9
-; NO-SIMD128-NEXT: i32.store 0($pop13), $pop11
+; NO-SIMD128-NEXT: i32.store 0($0), $pop11
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_s_v4i32:
@@ -11499,12 +9773,10 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: i32.extend16_s $push6=, $15
; NO-SIMD128-FAST-NEXT: i32.mul $push8=, $pop7, $pop6
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push12=, $8
-; NO-SIMD128-FAST-NEXT: i32.extend16_s $push11=, $16
-; NO-SIMD128-FAST-NEXT: i32.mul $push13=, $pop12, $pop11
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop10), $pop13
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push10=, $8
+; NO-SIMD128-FAST-NEXT: i32.extend16_s $push9=, $16
+; NO-SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -11535,31 +9807,29 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_low_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $3, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $11, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $2, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $4, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $12, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $1, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $3, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $9, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $11, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $2, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $10, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $1, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $9, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_low_u_v4i32:
@@ -11567,30 +9837,28 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $1, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $9, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $10, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $3, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $11, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $4, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $12, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%low1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -11621,31 +9889,29 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128: .functype extmul_high_u_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: i32.const $push0=, 65535
-; NO-SIMD128-NEXT: i32.and $push2=, $7, $pop0
-; NO-SIMD128-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-NEXT: i32.and $push1=, $15, $pop21
-; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
-; NO-SIMD128-NEXT: i32.store 8($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-NEXT: i32.and $push5=, $6, $pop20
+; NO-SIMD128-NEXT: i32.and $push2=, $8, $pop0
; NO-SIMD128-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-NEXT: i32.store 4($0), $pop6
+; NO-SIMD128-NEXT: i32.and $push1=, $16, $pop19
+; NO-SIMD128-NEXT: i32.mul $push3=, $pop2, $pop1
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
; NO-SIMD128-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-NEXT: i32.and $push8=, $5, $pop18
+; NO-SIMD128-NEXT: i32.and $push5=, $7, $pop18
; NO-SIMD128-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-NEXT: i32.and $push7=, $13, $pop17
-; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-NEXT: i32.store 0($0), $pop9
-; NO-SIMD128-NEXT: i32.const $push13=, 12
-; NO-SIMD128-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-NEXT: i32.and $push4=, $15, $pop17
+; NO-SIMD128-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.store 8($0), $pop6
; NO-SIMD128-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-NEXT: i32.and $push8=, $6, $pop16
; NO-SIMD128-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-NEXT: i32.and $push7=, $14, $pop15
+; NO-SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-NEXT: i32.store 4($0), $pop9
+; NO-SIMD128-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-NEXT: i32.and $push11=, $5, $pop14
+; NO-SIMD128-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-NEXT: i32.and $push10=, $13, $pop13
; NO-SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-NEXT: i32.store 0($0), $pop12
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: extmul_high_u_v4i32:
@@ -11653,30 +9919,28 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %v1, <8 x i16> %v2) {
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: i32.const $push0=, 65535
; NO-SIMD128-FAST-NEXT: i32.and $push2=, $5, $pop0
-; NO-SIMD128-FAST-NEXT: i32.const $push21=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop21
+; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push1=, $13, $pop19
; NO-SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push20=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop20
-; NO-SIMD128-FAST-NEXT: i32.const $push19=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop19
-; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push18=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop18
+; NO-SIMD128-FAST-NEXT: i32.and $push5=, $6, $pop18
; NO-SIMD128-FAST-NEXT: i32.const $push17=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop17
-; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
-; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-FAST-NEXT: i32.const $push13=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push14=, $0, $pop13
+; NO-SIMD128-FAST-NEXT: i32.and $push4=, $14, $pop17
+; NO-SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
+; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop6
; NO-SIMD128-FAST-NEXT: i32.const $push16=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop16
+; NO-SIMD128-FAST-NEXT: i32.and $push8=, $7, $pop16
; NO-SIMD128-FAST-NEXT: i32.const $push15=, 65535
-; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop15
+; NO-SIMD128-FAST-NEXT: i32.and $push7=, $15, $pop15
+; NO-SIMD128-FAST-NEXT: i32.mul $push9=, $pop8, $pop7
+; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop9
+; NO-SIMD128-FAST-NEXT: i32.const $push14=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push11=, $8, $pop14
+; NO-SIMD128-FAST-NEXT: i32.const $push13=, 65535
+; NO-SIMD128-FAST-NEXT: i32.and $push10=, $16, $pop13
; NO-SIMD128-FAST-NEXT: i32.mul $push12=, $pop11, $pop10
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop14), $pop12
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop12
; NO-SIMD128-FAST-NEXT: return
%high1 = shufflevector <8 x i16> %v1, <8 x i16> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -13061,16 +11325,14 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: neg_v4f32:
; NO-SIMD128: .functype neg_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.neg $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.neg $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.neg $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.neg $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.neg $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.neg $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.neg $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: neg_v4f32:
@@ -13082,10 +11344,8 @@ define <4 x float> @neg_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.neg $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.neg $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.neg $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub nsz <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, %x
ret <4 x float> %a
@@ -13108,16 +11368,14 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: abs_v4f32:
; NO-SIMD128: .functype abs_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.abs $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.abs $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.abs $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.abs $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.abs $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.abs $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.abs $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: abs_v4f32:
@@ -13129,10 +11387,8 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.abs $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.abs $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.abs $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
ret <4 x float> %a
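With the nsz flag, fsub 0.0, x is a plain sign flip, and llvm.fabs clears the sign bit, so both of these tests reduce to one scalar instruction plus one store per lane; lane 3 after the change, copied from the hunk above:

    f32.neg   $push0=, $4
    f32.store 12($0), $pop0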
@@ -13157,54 +11413,50 @@ define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.gt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.gt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.gt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.gt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_unordered_v4f32:
; NO-SIMD128-FAST: .functype min_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.gt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.gt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.gt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13231,54 +11483,50 @@ define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.lt $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.lt $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.lt $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.lt $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_unordered_v4f32:
; NO-SIMD128-FAST: .functype max_unordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.lt $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.lt $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.lt $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
%a = select <4 x i1> %cmps, <4 x float> %x,
@@ -13305,54 +11553,50 @@ define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.ge $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.ge $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.ge $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.ge $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_ordered_v4f32:
; NO-SIMD128-FAST: .functype min_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.ge $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.ge $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.ge $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13379,54 +11623,50 @@ define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
; NO-SIMD128: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push1=, $3, $pop17
-; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $3, $pop1
-; NO-SIMD128-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push16=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.le $push1=, $4, $pop15
+; NO-SIMD128-NEXT: f32.select $push2=, $pop0, $4, $pop1
+; NO-SIMD128-NEXT: f32.store 12($0), $pop2
; NO-SIMD128-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push5=, $1, $pop13
-; NO-SIMD128-NEXT: f32.select $push6=, $pop14, $1, $pop5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop6
-; NO-SIMD128-NEXT: i32.const $push9=, 12
-; NO-SIMD128-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-NEXT: f32.le $push3=, $3, $pop13
+; NO-SIMD128-NEXT: f32.select $push4=, $pop14, $3, $pop3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop4
; NO-SIMD128-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-NEXT: f32.le $push5=, $2, $pop11
+; NO-SIMD128-NEXT: f32.select $push6=, $pop12, $2, $pop5
+; NO-SIMD128-NEXT: f32.store 4($0), $pop6
+; NO-SIMD128-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-NEXT: f32.le $push7=, $1, $pop9
+; NO-SIMD128-NEXT: f32.select $push8=, $pop10, $1, $pop7
+; NO-SIMD128-NEXT: f32.store 0($0), $pop8
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_ordered_v4f32:
; NO-SIMD128-FAST: .functype max_ordered_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-FAST-NEXT: # %bb.0:
; NO-SIMD128-FAST-NEXT: f32.const $push0=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push17=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop17
+; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push1=, $1, $pop15
; NO-SIMD128-FAST-NEXT: f32.select $push2=, $pop0, $1, $pop1
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push16=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.const $push15=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop15
-; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop16, $2, $pop3
-; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push14=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push13=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop13
-; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop14, $3, $pop5
-; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
-; NO-SIMD128-FAST-NEXT: i32.const $push9=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push10=, $0, $pop9
+; NO-SIMD128-FAST-NEXT: f32.le $push3=, $2, $pop13
+; NO-SIMD128-FAST-NEXT: f32.select $push4=, $pop14, $2, $pop3
+; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop4
; NO-SIMD128-FAST-NEXT: f32.const $push12=, 0x1.4p2
; NO-SIMD128-FAST-NEXT: f32.const $push11=, 0x1.4p2
-; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop11
-; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop12, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop10), $pop8
+; NO-SIMD128-FAST-NEXT: f32.le $push5=, $3, $pop11
+; NO-SIMD128-FAST-NEXT: f32.select $push6=, $pop12, $3, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push10=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.const $push9=, 0x1.4p2
+; NO-SIMD128-FAST-NEXT: f32.le $push7=, $4, $pop9
+; NO-SIMD128-FAST-NEXT: f32.select $push8=, $pop10, $4, $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop8
; NO-SIMD128-FAST-NEXT: return
%cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
%a = select <4 x i1> %cmps,
@@ -13451,16 +11691,14 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: min_intrinsic_v4f32:
; NO-SIMD128: .functype min_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.min $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.min $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.min $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.min $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.min $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.min $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.min $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.min $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: min_intrinsic_v4f32:
@@ -13472,10 +11710,8 @@ define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.min $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
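llvm.minimum/llvm.maximum require NaN-propagating semantics with -0.0 ordered before +0.0, which is exactly what WebAssembly's scalar f32.min/f32.max provide, so the intrinsic lowers to one instruction per lane even without SIMD:

    f32.min   $push0=, $4, $8    ;; lane 3: minimum(x[3], y[3]), NaN-propagating
    f32.store 12($0), $pop0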
@@ -13552,16 +11788,14 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32:
@@ -13573,10 +11807,8 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
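llvm.minnum has libm fmin semantics (if one operand is NaN, return the other), which f32.min does not provide, so the scalarized lowering calls the fminf libcall per lane; as the next test shows, adding nsz on top of nnan still leaves the libcall in place:

    call      $push0=, fminf, $4, $8    ;; lane 3 via libm fminf
    f32.store 12($0), $pop0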
@@ -13598,16 +11830,14 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: minnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fminf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fminf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fminf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32:
@@ -13619,10 +11849,8 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13647,19 +11875,17 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push2=, fminf, $2, $pop9
-; NO-SIMD128-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $1, $pop8
-; NO-SIMD128-NEXT: f32.store 0($0), $pop3
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop3
+; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop4
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_non_zero_intrinsic:
@@ -13668,17 +11894,15 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0
; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0
; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop8
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3
-; NO-SIMD128-FAST-NEXT: i32.const $push4=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push6=, fminf, $4, $pop7
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop5), $pop6
+; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float -1.0, float -1.0, float -1.0>)
ret <4 x float> %a
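When the second operand is a constant splat, each scalarized libcall rematerializes the constant: the four f32.const -0x1p0 pushes above ($push0, $push7, $push6, $push5) are the same -1.0, renumbered now that the address computation for lane 3 is gone. Lane 3, copied from the hunk:

    f32.const $push0=, -0x1p0
    call      $push1=, fminf, $4, $pop0   ;; fminf(x[3], -1.0)
    f32.store 12($0), $pop1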
@@ -13755,19 +11979,17 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fminf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic:
@@ -13779,14 +12001,12 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fminf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -13809,16 +12029,14 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: max_intrinsic_v4f32:
; NO-SIMD128: .functype max_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.max $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.max $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.max $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.max $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.max $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.max $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.max $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.max $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: max_intrinsic_v4f32:
@@ -13830,10 +12048,8 @@ define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13910,16 +12126,14 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32:
@@ -13931,10 +12145,8 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -13956,16 +12168,14 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32:
; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: call $push0=, fmaxf, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: call $push2=, fmaxf, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32:
@@ -13977,10 +12187,8 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
ret <4 x float> %a
@@ -14057,19 +12265,17 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x0p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32:
@@ -14081,14 +12287,12 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14113,19 +12317,17 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0
-; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.const $push2=, 0x1p0
-; NO-SIMD128-NEXT: call $push3=, fmaxf, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-NEXT: call $push4=, fmaxf, $1, $pop9
-; NO-SIMD128-NEXT: f32.store 0($0), $pop4
-; NO-SIMD128-NEXT: i32.const $push5=, 12
-; NO-SIMD128-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop2
+; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0
+; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3
+; NO-SIMD128-NEXT: f32.store 4($0), $pop4
+; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop5
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: maxnum_non_zero_intrinsic_v4f32:
@@ -14137,14 +12339,12 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float>
; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0
; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop9
+; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4
-; NO-SIMD128-FAST-NEXT: i32.const $push5=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push6=, $0, $pop5
-; NO-SIMD128-FAST-NEXT: f32.const $push8=, -0x1p0
-; NO-SIMD128-FAST-NEXT: call $push7=, fmaxf, $4, $pop8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop6), $pop7
+; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0
+; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5
; NO-SIMD128-FAST-NEXT: return
%a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 1.0, float -1.0, float -1.0>)
ret <4 x float> %a
@@ -14240,20 +12440,18 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmin_v4f32:
; NO-SIMD128: .functype pmin_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $7, $3
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $6, $2
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $5, $1
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $8, $4
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $8, $4
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $7, $3
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $6, $2
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $5, $1
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_v4f32:
@@ -14268,11 +12466,9 @@ define <4 x float> @pmin_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $7, $3
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $8, $4
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %y, %x
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14295,28 +12491,26 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmin_int_v4f32:
; NO-SIMD128: .functype pmin_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $8
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $4
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $7
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $3
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $6
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $2
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $5
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $1
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $3
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $6
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $2
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $5
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $1
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmin_int_v4f32:
@@ -14337,13 +12531,11 @@ define <4 x i32> @pmin_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $8
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $4
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14368,20 +12560,18 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: pmax_v4f32:
; NO-SIMD128: .functype pmax_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.lt $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.select $push1=, $7, $3, $pop0
-; NO-SIMD128-NEXT: f32.store 8($0), $pop1
-; NO-SIMD128-NEXT: f32.lt $push2=, $2, $6
-; NO-SIMD128-NEXT: f32.select $push3=, $6, $2, $pop2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop3
-; NO-SIMD128-NEXT: f32.lt $push4=, $1, $5
-; NO-SIMD128-NEXT: f32.select $push5=, $5, $1, $pop4
-; NO-SIMD128-NEXT: f32.store 0($0), $pop5
-; NO-SIMD128-NEXT: i32.const $push8=, 12
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: f32.lt $push6=, $4, $8
-; NO-SIMD128-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.select $push1=, $8, $4, $pop0
+; NO-SIMD128-NEXT: f32.store 12($0), $pop1
+; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7
+; NO-SIMD128-NEXT: f32.select $push3=, $7, $3, $pop2
+; NO-SIMD128-NEXT: f32.store 8($0), $pop3
+; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6
+; NO-SIMD128-NEXT: f32.select $push5=, $6, $2, $pop4
+; NO-SIMD128-NEXT: f32.store 4($0), $pop5
+; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5
+; NO-SIMD128-NEXT: f32.select $push7=, $5, $1, $pop6
+; NO-SIMD128-NEXT: f32.store 0($0), $pop7
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_v4f32:
@@ -14396,11 +12586,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.select $push5=, $7, $3, $pop4
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5
-; NO-SIMD128-FAST-NEXT: i32.const $push8=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push9=, $0, $pop8
; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8
; NO-SIMD128-FAST-NEXT: f32.select $push7=, $8, $4, $pop6
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop9), $pop7
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7
; NO-SIMD128-FAST-NEXT: return
%c = fcmp olt <4 x float> %x, %y
%a = select <4 x i1> %c, <4 x float> %y, <4 x float> %x
@@ -14423,28 +12611,26 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: pmax_int_v4f32:
; NO-SIMD128: .functype pmax_int_v4f32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push1=, $4
; NO-SIMD128-NEXT: f32.reinterpret_i32 $push0=, $8
; NO-SIMD128-NEXT: f32.lt $push2=, $pop1, $pop0
; NO-SIMD128-NEXT: i32.select $push3=, $8, $4, $pop2
-; NO-SIMD128-NEXT: i32.store 0($pop5), $pop3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push7=, $3
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push6=, $7
-; NO-SIMD128-NEXT: f32.lt $push8=, $pop7, $pop6
-; NO-SIMD128-NEXT: i32.select $push9=, $7, $3, $pop8
-; NO-SIMD128-NEXT: i32.store 8($0), $pop9
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push11=, $2
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push10=, $6
-; NO-SIMD128-NEXT: f32.lt $push12=, $pop11, $pop10
-; NO-SIMD128-NEXT: i32.select $push13=, $6, $2, $pop12
-; NO-SIMD128-NEXT: i32.store 4($0), $pop13
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push15=, $1
-; NO-SIMD128-NEXT: f32.reinterpret_i32 $push14=, $5
-; NO-SIMD128-NEXT: f32.lt $push16=, $pop15, $pop14
-; NO-SIMD128-NEXT: i32.select $push17=, $5, $1, $pop16
-; NO-SIMD128-NEXT: i32.store 0($0), $pop17
+; NO-SIMD128-NEXT: i32.store 12($0), $pop3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push5=, $3
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push4=, $7
+; NO-SIMD128-NEXT: f32.lt $push6=, $pop5, $pop4
+; NO-SIMD128-NEXT: i32.select $push7=, $7, $3, $pop6
+; NO-SIMD128-NEXT: i32.store 8($0), $pop7
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push9=, $2
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push8=, $6
+; NO-SIMD128-NEXT: f32.lt $push10=, $pop9, $pop8
+; NO-SIMD128-NEXT: i32.select $push11=, $6, $2, $pop10
+; NO-SIMD128-NEXT: i32.store 4($0), $pop11
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push13=, $1
+; NO-SIMD128-NEXT: f32.reinterpret_i32 $push12=, $5
+; NO-SIMD128-NEXT: f32.lt $push14=, $pop13, $pop12
+; NO-SIMD128-NEXT: i32.select $push15=, $5, $1, $pop14
+; NO-SIMD128-NEXT: i32.store 0($0), $pop15
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: pmax_int_v4f32:
@@ -14465,13 +12651,11 @@ define <4 x i32> @pmax_int_v4f32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-FAST-NEXT: f32.lt $push10=, $pop9, $pop8
; NO-SIMD128-FAST-NEXT: i32.select $push11=, $7, $3, $pop10
; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop11
-; NO-SIMD128-FAST-NEXT: i32.const $push16=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push17=, $0, $pop16
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push13=, $4
; NO-SIMD128-FAST-NEXT: f32.reinterpret_i32 $push12=, $8
; NO-SIMD128-FAST-NEXT: f32.lt $push14=, $pop13, $pop12
; NO-SIMD128-FAST-NEXT: i32.select $push15=, $8, $4, $pop14
-; NO-SIMD128-FAST-NEXT: i32.store 0($pop17), $pop15
+; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop15
; NO-SIMD128-FAST-NEXT: return
%fx = bitcast <4 x i32> %x to <4 x float>
%fy = bitcast <4 x i32> %y to <4 x float>
@@ -14496,16 +12680,14 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: add_v4f32:
; NO-SIMD128: .functype add_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.add $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.add $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.add $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.add $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.add $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.add $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.add $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.add $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: add_v4f32:
@@ -14517,10 +12699,8 @@ define <4 x float> @add_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.add $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.add $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.add $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fadd <4 x float> %x, %y
ret <4 x float> %a
@@ -14542,16 +12722,14 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: sub_v4f32:
; NO-SIMD128: .functype sub_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sub $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sub $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sub $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.sub $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.sub $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sub $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sub $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sub $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sub_v4f32:
@@ -14563,10 +12741,8 @@ define <4 x float> @sub_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sub $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sub $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sub $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fsub <4 x float> %x, %y
ret <4 x float> %a
@@ -14588,16 +12764,14 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: div_v4f32:
; NO-SIMD128: .functype div_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.div $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.div $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.div $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.div $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.div $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.div $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.div $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.div $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: div_v4f32:
@@ -14609,10 +12783,8 @@ define <4 x float> @div_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.div $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.div $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.div $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fdiv <4 x float> %x, %y
ret <4 x float> %a
@@ -14634,16 +12806,14 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: mul_v4f32:
; NO-SIMD128: .functype mul_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.mul $push0=, $3, $7
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.mul $push1=, $2, $6
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.mul $push2=, $1, $5
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push4=, 12
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: f32.mul $push3=, $4, $8
-; NO-SIMD128-NEXT: f32.store 0($pop5), $pop3
+; NO-SIMD128-NEXT: f32.mul $push0=, $4, $8
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.mul $push1=, $3, $7
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.mul $push2=, $2, $6
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.mul $push3=, $1, $5
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: mul_v4f32:
@@ -14655,10 +12825,8 @@ define <4 x float> @mul_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.mul $push2=, $3, $7
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.mul $push5=, $4, $8
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.mul $push3=, $4, $8
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = fmul <4 x float> %x, %y
ret <4 x float> %a
@@ -14681,16 +12849,14 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-LABEL: sqrt_v4f32:
; NO-SIMD128: .functype sqrt_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
-; NO-SIMD128-NEXT: f32.sqrt $push0=, $3
-; NO-SIMD128-NEXT: f32.store 8($0), $pop0
-; NO-SIMD128-NEXT: f32.sqrt $push1=, $2
-; NO-SIMD128-NEXT: f32.store 4($0), $pop1
-; NO-SIMD128-NEXT: f32.sqrt $push2=, $1
-; NO-SIMD128-NEXT: f32.store 0($0), $pop2
-; NO-SIMD128-NEXT: i32.const $push3=, 12
-; NO-SIMD128-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-NEXT: f32.sqrt $push0=, $4
+; NO-SIMD128-NEXT: f32.store 12($0), $pop0
+; NO-SIMD128-NEXT: f32.sqrt $push1=, $3
+; NO-SIMD128-NEXT: f32.store 8($0), $pop1
+; NO-SIMD128-NEXT: f32.sqrt $push2=, $2
+; NO-SIMD128-NEXT: f32.store 4($0), $pop2
+; NO-SIMD128-NEXT: f32.sqrt $push3=, $1
+; NO-SIMD128-NEXT: f32.store 0($0), $pop3
; NO-SIMD128-NEXT: return
;
; NO-SIMD128-FAST-LABEL: sqrt_v4f32:
@@ -14702,10 +12868,8 @@ define <4 x float> @sqrt_v4f32(<4 x float> %x) {
; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1
; NO-SIMD128-FAST-NEXT: f32.sqrt $push2=, $3
; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2
-; NO-SIMD128-FAST-NEXT: i32.const $push3=, 12
-; NO-SIMD128-FAST-NEXT: i32.add $push4=, $0, $pop3
-; NO-SIMD128-FAST-NEXT: f32.sqrt $push5=, $4
-; NO-SIMD128-FAST-NEXT: f32.store 0($pop4), $pop5
+; NO-SIMD128-FAST-NEXT: f32.sqrt $push3=, $4
+; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3
; NO-SIMD128-FAST-NEXT: return
%a = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
ret <4 x float> %a
diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll
index d2a38de..5ec9f6a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd.ll
@@ -38,44 +38,22 @@ define <16 x i8> @splat_v16i8(i8 %x) {
; NO-SIMD128-LABEL: splat_v16i8:
; NO-SIMD128: .functype splat_v16i8 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $1
+; NO-SIMD128-NEXT: i32.store8 14($0), $1
+; NO-SIMD128-NEXT: i32.store8 13($0), $1
+; NO-SIMD128-NEXT: i32.store8 12($0), $1
+; NO-SIMD128-NEXT: i32.store8 11($0), $1
+; NO-SIMD128-NEXT: i32.store8 10($0), $1
+; NO-SIMD128-NEXT: i32.store8 9($0), $1
; NO-SIMD128-NEXT: i32.store8 8($0), $1
+; NO-SIMD128-NEXT: i32.store8 7($0), $1
+; NO-SIMD128-NEXT: i32.store8 6($0), $1
+; NO-SIMD128-NEXT: i32.store8 5($0), $1
; NO-SIMD128-NEXT: i32.store8 4($0), $1
+; NO-SIMD128-NEXT: i32.store8 3($0), $1
; NO-SIMD128-NEXT: i32.store8 2($0), $1
; NO-SIMD128-NEXT: i32.store8 1($0), $1
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $1
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $1
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $1
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $1
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $1
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $1
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $1
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $1
; NO-SIMD128-NEXT: return
%v = insertelement <16 x i8> undef, i8 %x, i32 0
%res = shufflevector <16 x i8> %v, <16 x i8> undef,
@@ -356,44 +334,22 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_v16i8:
; NO-SIMD128: .functype replace_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $17
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $17
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 11
ret <16 x i8> %res
@@ -461,44 +417,22 @@ define <16 x i8> @replace_zero_v16i8(<16 x i8> %v, i8 %x) {
; NO-SIMD128-LABEL: replace_zero_v16i8:
; NO-SIMD128: .functype replace_zero_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $17
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
%res = insertelement <16 x i8> %v, i8 %x, i32 0
ret <16 x i8> %res
@@ -514,44 +448,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_v16i8:
; NO-SIMD128: .functype shuffle_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $32
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $30
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $28
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $26
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $24
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $22
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $20
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $18
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $32
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $30
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $28
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $26
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $24
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $22
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $20
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23,
@@ -569,44 +481,22 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v16i8:
; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $2
+; NO-SIMD128-NEXT: i32.store8 14($0), $2
+; NO-SIMD128-NEXT: i32.store8 13($0), $2
+; NO-SIMD128-NEXT: i32.store8 12($0), $2
+; NO-SIMD128-NEXT: i32.store8 11($0), $2
+; NO-SIMD128-NEXT: i32.store8 10($0), $2
+; NO-SIMD128-NEXT: i32.store8 9($0), $2
; NO-SIMD128-NEXT: i32.store8 8($0), $2
+; NO-SIMD128-NEXT: i32.store8 7($0), $2
+; NO-SIMD128-NEXT: i32.store8 6($0), $2
+; NO-SIMD128-NEXT: i32.store8 5($0), $2
; NO-SIMD128-NEXT: i32.store8 4($0), $2
+; NO-SIMD128-NEXT: i32.store8 3($0), $2
; NO-SIMD128-NEXT: i32.store8 2($0), $2
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $2
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $2
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $2
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $2
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $2
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $2
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $2
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <16 x i8> %x, <16 x i8> %y,
<16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -641,44 +531,22 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3,
; NO-SIMD128-LABEL: build_v16i8:
; NO-SIMD128: .functype build_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store8 15($0), $16
+; NO-SIMD128-NEXT: i32.store8 14($0), $15
+; NO-SIMD128-NEXT: i32.store8 13($0), $14
+; NO-SIMD128-NEXT: i32.store8 12($0), $13
+; NO-SIMD128-NEXT: i32.store8 11($0), $12
+; NO-SIMD128-NEXT: i32.store8 10($0), $11
+; NO-SIMD128-NEXT: i32.store8 9($0), $10
; NO-SIMD128-NEXT: i32.store8 8($0), $9
+; NO-SIMD128-NEXT: i32.store8 7($0), $8
+; NO-SIMD128-NEXT: i32.store8 6($0), $7
+; NO-SIMD128-NEXT: i32.store8 5($0), $6
; NO-SIMD128-NEXT: i32.store8 4($0), $5
+; NO-SIMD128-NEXT: i32.store8 3($0), $4
; NO-SIMD128-NEXT: i32.store8 2($0), $3
; NO-SIMD128-NEXT: i32.store8 1($0), $2
; NO-SIMD128-NEXT: i32.store8 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 15
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store8 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 14
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store8 0($pop3), $15
-; NO-SIMD128-NEXT: i32.const $push4=, 13
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store8 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 12
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store8 0($pop7), $13
-; NO-SIMD128-NEXT: i32.const $push8=, 11
-; NO-SIMD128-NEXT: i32.add $push9=, $0, $pop8
-; NO-SIMD128-NEXT: i32.store8 0($pop9), $12
-; NO-SIMD128-NEXT: i32.const $push10=, 10
-; NO-SIMD128-NEXT: i32.add $push11=, $0, $pop10
-; NO-SIMD128-NEXT: i32.store8 0($pop11), $11
-; NO-SIMD128-NEXT: i32.const $push12=, 9
-; NO-SIMD128-NEXT: i32.add $push13=, $0, $pop12
-; NO-SIMD128-NEXT: i32.store8 0($pop13), $10
-; NO-SIMD128-NEXT: i32.const $push14=, 7
-; NO-SIMD128-NEXT: i32.add $push15=, $0, $pop14
-; NO-SIMD128-NEXT: i32.store8 0($pop15), $8
-; NO-SIMD128-NEXT: i32.const $push16=, 6
-; NO-SIMD128-NEXT: i32.add $push17=, $0, $pop16
-; NO-SIMD128-NEXT: i32.store8 0($pop17), $7
-; NO-SIMD128-NEXT: i32.const $push18=, 5
-; NO-SIMD128-NEXT: i32.add $push19=, $0, $pop18
-; NO-SIMD128-NEXT: i32.store8 0($pop19), $6
-; NO-SIMD128-NEXT: i32.const $push20=, 3
-; NO-SIMD128-NEXT: i32.add $push21=, $0, $pop20
-; NO-SIMD128-NEXT: i32.store8 0($pop21), $4
; NO-SIMD128-NEXT: return
i8 %x4, i8 %x5, i8 %x6, i8 %x7,
i8 %x8, i8 %x9, i8 %x10, i8 %x11,
@@ -734,22 +602,14 @@ define <8 x i16> @splat_v8i16(i16 %x) {
; NO-SIMD128-LABEL: splat_v8i16:
; NO-SIMD128: .functype splat_v8i16 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $1
+; NO-SIMD128-NEXT: i32.store16 12($0), $1
+; NO-SIMD128-NEXT: i32.store16 10($0), $1
; NO-SIMD128-NEXT: i32.store16 8($0), $1
+; NO-SIMD128-NEXT: i32.store16 6($0), $1
; NO-SIMD128-NEXT: i32.store16 4($0), $1
; NO-SIMD128-NEXT: i32.store16 2($0), $1
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $1
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $1
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $1
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $1
; NO-SIMD128-NEXT: return
%v = insertelement <8 x i16> undef, i16 %x, i32 0
%res = shufflevector <8 x i16> %v, <8 x i16> undef,
@@ -1016,22 +876,14 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_v8i16:
; NO-SIMD128: .functype replace_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $9
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $9
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 7
ret <8 x i16> %res
@@ -1095,22 +947,14 @@ define <8 x i16> @replace_zero_v8i16(<8 x i16> %v, i16 %x) {
; NO-SIMD128-LABEL: replace_zero_v8i16:
; NO-SIMD128: .functype replace_zero_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $9
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
%res = insertelement <8 x i16> %v, i16 %x, i32 0
ret <8 x i16> %res
@@ -1126,22 +970,14 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_v8i16:
; NO-SIMD128: .functype shuffle_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $16
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $14
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $12
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $10
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $16
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $14
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $12
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -1158,22 +994,14 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v8i16:
; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $2
+; NO-SIMD128-NEXT: i32.store16 12($0), $2
+; NO-SIMD128-NEXT: i32.store16 10($0), $2
; NO-SIMD128-NEXT: i32.store16 8($0), $2
+; NO-SIMD128-NEXT: i32.store16 6($0), $2
; NO-SIMD128-NEXT: i32.store16 4($0), $2
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $2
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $2
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $2
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <8 x i16> %x, <8 x i16> %y,
<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
@@ -1198,22 +1026,14 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3,
; NO-SIMD128-LABEL: build_v8i16:
; NO-SIMD128: .functype build_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store16 14($0), $8
+; NO-SIMD128-NEXT: i32.store16 12($0), $7
+; NO-SIMD128-NEXT: i32.store16 10($0), $6
; NO-SIMD128-NEXT: i32.store16 8($0), $5
+; NO-SIMD128-NEXT: i32.store16 6($0), $4
; NO-SIMD128-NEXT: i32.store16 4($0), $3
; NO-SIMD128-NEXT: i32.store16 2($0), $2
; NO-SIMD128-NEXT: i32.store16 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 14
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store16 0($pop1), $8
-; NO-SIMD128-NEXT: i32.const $push2=, 12
-; NO-SIMD128-NEXT: i32.add $push3=, $0, $pop2
-; NO-SIMD128-NEXT: i32.store16 0($pop3), $7
-; NO-SIMD128-NEXT: i32.const $push4=, 10
-; NO-SIMD128-NEXT: i32.add $push5=, $0, $pop4
-; NO-SIMD128-NEXT: i32.store16 0($pop5), $6
-; NO-SIMD128-NEXT: i32.const $push6=, 6
-; NO-SIMD128-NEXT: i32.add $push7=, $0, $pop6
-; NO-SIMD128-NEXT: i32.store16 0($pop7), $4
; NO-SIMD128-NEXT: return
i16 %x4, i16 %x5, i16 %x6, i16 %x7) {
%t0 = insertelement <8 x i16> undef, i16 %x0, i32 0
@@ -1258,12 +1078,10 @@ define <4 x i32> @splat_v4i32(i32 %x) {
; NO-SIMD128-LABEL: splat_v4i32:
; NO-SIMD128: .functype splat_v4i32 (i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $1
; NO-SIMD128-NEXT: i32.store 8($0), $1
; NO-SIMD128-NEXT: i32.store 4($0), $1
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x i32> undef, i32 %x, i32 0
%res = shufflevector <4 x i32> %v, <4 x i32> undef,
@@ -1368,12 +1186,10 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_v4i32:
; NO-SIMD128: .functype replace_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $5
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 2
ret <4 x i32> %res
@@ -1433,12 +1249,10 @@ define <4 x i32> @replace_zero_v4i32(<4 x i32> %v, i32 %x) {
; NO-SIMD128-LABEL: replace_zero_v4i32:
; NO-SIMD128: .functype replace_zero_v4i32 (i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x i32> %v, i32 %x, i32 0
ret <4 x i32> %res
@@ -1454,12 +1268,10 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_v4i32:
; NO-SIMD128: .functype shuffle_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $8
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $6
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1476,12 +1288,10 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4i32:
; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $2
; NO-SIMD128-NEXT: i32.store 8($0), $2
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x i32> %x, <4 x i32> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -1501,12 +1311,10 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; NO-SIMD128-LABEL: build_v4i32:
; NO-SIMD128: .functype build_v4i32 (i32, i32, i32, i32, i32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: i32.store 12($0), $4
; NO-SIMD128-NEXT: i32.store 8($0), $3
; NO-SIMD128-NEXT: i32.store 4($0), $2
; NO-SIMD128-NEXT: i32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: i32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x i32> undef, i32 %x0, i32 0
%t1 = insertelement <4 x i32> %t0, i32 %x1, i32 1
@@ -1801,12 +1609,10 @@ define <4 x float> @splat_v4f32(float %x) {
; NO-SIMD128-LABEL: splat_v4f32:
; NO-SIMD128: .functype splat_v4f32 (i32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $1
; NO-SIMD128-NEXT: f32.store 8($0), $1
; NO-SIMD128-NEXT: f32.store 4($0), $1
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $1
; NO-SIMD128-NEXT: return
%v = insertelement <4 x float> undef, float %x, i32 0
%res = shufflevector <4 x float> %v, <4 x float> undef,
@@ -1911,12 +1717,10 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_v4f32:
; NO-SIMD128: .functype replace_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $5
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 2
ret <4 x float> %res
@@ -1976,12 +1780,10 @@ define <4 x float> @replace_zero_v4f32(<4 x float> %v, float %x) {
; NO-SIMD128-LABEL: replace_zero_v4f32:
; NO-SIMD128: .functype replace_zero_v4f32 (i32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $5
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%res = insertelement <4 x float> %v, float %x, i32 0
ret <4 x float> %res
@@ -1997,12 +1799,10 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_v4f32:
; NO-SIMD128: .functype shuffle_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $8
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $6
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $8
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -2019,12 +1819,10 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
; NO-SIMD128-LABEL: shuffle_undef_v4f32:
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $2
; NO-SIMD128-NEXT: f32.store 8($0), $2
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $2
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $2
; NO-SIMD128-NEXT: return
%res = shufflevector <4 x float> %x, <4 x float> %y,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -2044,12 +1842,10 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) {
; NO-SIMD128-LABEL: build_v4f32:
; NO-SIMD128: .functype build_v4f32 (i32, f32, f32, f32, f32) -> ()
; NO-SIMD128-NEXT: # %bb.0:
+; NO-SIMD128-NEXT: f32.store 12($0), $4
; NO-SIMD128-NEXT: f32.store 8($0), $3
; NO-SIMD128-NEXT: f32.store 4($0), $2
; NO-SIMD128-NEXT: f32.store 0($0), $1
-; NO-SIMD128-NEXT: i32.const $push0=, 12
-; NO-SIMD128-NEXT: i32.add $push1=, $0, $pop0
-; NO-SIMD128-NEXT: f32.store 0($pop1), $4
; NO-SIMD128-NEXT: return
%t0 = insertelement <4 x float> undef, float %x0, i32 0
%t1 = insertelement <4 x float> %t0, float %x1, i32 1
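; A hedged summary of the recurring change in the hunks above: the store to
; the highest lane now folds the constant 12 into the store's offset field
; (e.g. 'i32.store 12($0), $8'), so the explicit i32.const/i32.add address
; computation is no longer needed.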
diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
index 609be3b..50e736a 100644
--- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s | FileCheck %s
; Check that the shr(shl(X, 56), 48) is not mistakenly turned into
@@ -16,11 +17,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-unknown-linux-gnu"
define i64 @foo(i64 %b) nounwind readnone {
-entry:
; CHECK-LABEL: foo:
-; CHECK: movsbq %dil, %rax
-; CHECK: shlq $8, %rax
-; CHECK: orq $1, %rax
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movsbq %dil, %rax
+; CHECK-NEXT: shlq $8, %rax
+; CHECK-NEXT: incq %rax
+; CHECK-NEXT: retq
+entry:
%shl = shl i64 %b, 56 ; <i64> [#uses=1]
%shr = ashr i64 %shl, 48 ; <i64> [#uses=1]
%add5 = or i64 %shr, 1 ; <i64> [#uses=1]
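; A minimal sketch of why the or->inc rewrite above is sound (assuming the
; usual known-bits reasoning): shl by 56 followed by ashr by 48 leaves the
; low 8 bits of %shr known zero, so 'or %shr, 1' cannot carry and is
; equivalent to 'add %shr, 1', which x86 emits as the shorter incq.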
diff --git a/llvm/test/CodeGen/X86/AppendingLinkage.ll b/llvm/test/CodeGen/X86/AppendingLinkage.ll
index 83bfbe8..ace5d19 100644
--- a/llvm/test/CodeGen/X86/AppendingLinkage.ll
+++ b/llvm/test/CodeGen/X86/AppendingLinkage.ll
@@ -1,4 +1,4 @@
; RUN: not --crash llc < %s -mtriple=i686-- 2>&1 | FileCheck %s
-; CHECK: unknown special variable
+; CHECK: unknown special variable with appending linkage
@foo = appending constant [1 x i32 ]zeroinitializer
diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll
index 7a8ddf5..cb2d426 100644
--- a/llvm/test/CodeGen/X86/combine-pavg.ll
+++ b/llvm/test/CodeGen/X86/combine-pavg.ll
@@ -84,25 +84,22 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: combine_pavgw_demandedelts:
; SSE: # %bb.0:
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
; SSE-NEXT: pavgw %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pavgw_demandedelts:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,12,13,12,13]
; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pavgw_demandedelts:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
%s0 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%avg = tail call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %s0, <8 x i16> %a1)
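; A hedged reading of the codegen change above: demanded-elements analysis
; recognizes that only a subset of the averaged lanes is used downstream, so
; the shuffles that previously ran on both pavgw inputs can be sunk below
; the average and merged into a single shuffle of its result.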
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
index 548cf24..13c9585 100644
--- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -869,13 +869,13 @@ body: |
$ymm0 = VSHUFPSZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, -24
; CHECK: $ymm0 = VSHUFPSYrri $ymm0, $ymm1, -24
$ymm0 = VSHUFPSZ256rri $ymm0, $ymm1, -24
- ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPDYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPDZ256rri $ymm0, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15, implicit $mxcsr
+ ; CHECK: $ymm0 = VROUNDPSYri $ymm0, 15, implicit $mxcsr
$ymm0 = VRNDSCALEPSZ256rri $ymm0, 15, implicit $mxcsr
; CHECK: $ymm0 = VPERM2F128rm $ymm0, $rip, 1, $noreg, 0, $noreg, 32
$ymm0 = VSHUFF32X4Z256rmi $ymm0, $rip, 1, $noreg, 0, $noreg, 228
@@ -1751,13 +1751,13 @@ body: |
$xmm0 = VALIGNQZ128rmi $xmm0, $rip, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VPALIGNRrri $xmm0, $xmm1, 8
$xmm0 = VALIGNQZ128rri $xmm0, $xmm1, 1
- ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPDri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPDZ128rri $xmm0, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDPSri $xmm0, 15, implicit $mxcsr
$xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr
RET64
@@ -2308,21 +2308,21 @@ body: |
$xmm0 = VINSERTPSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm0 = VINSERTPSrr $xmm0, $xmm0, 1
$xmm0 = VINSERTPSZrr $xmm0, $xmm0, 1
- ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSDri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSmi_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $noreg, 0, $noreg, 15, implicit $mxcsr
- ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VROUNDSSri_Int $xmm0, $xmm1, 15, implicit $mxcsr
$xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15, implicit $mxcsr
RET64
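; Note on the renames above: the VROUND* opcode names appear to gain
; explicit operand-kind suffixes (mi for memory+immediate, ri for
; register+immediate), matching the X86 backend's usual naming scheme.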
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index d9ee5f0..ee7f4ae 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -173,16 +173,14 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovdqa (%edx), %xmm0
; X86-NEXT: vpand (%ecx), %xmm0, %xmm0
-; X86-NEXT: vpextrb $6, %xmm0, %ecx
-; X86-NEXT: movb %cl, (%eax)
+; X86-NEXT: vpextrb $6, %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_extractelement:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vpand (%rsi), %xmm0, %xmm0
-; X64-NEXT: vpextrb $6, %xmm0, %eax
-; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: vpextrb $6, %xmm0, (%rdx)
; X64-NEXT: retq
%i0 = load <16 x i8>, ptr %origin0
%i1 = load <16 x i8>, ptr %origin1
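; Note on the change above: the extracted byte is now stored directly via
; vpextrb with a memory operand, avoiding the round trip through a
; general-purpose register and the separate movb.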
diff --git a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
index 64d44d9..0123431 100644
--- a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
+++ b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll
@@ -1,59 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=NUM
; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s --check-prefix=SJLJ
-; NUM-COUNT-3: endbr64
-
-;SJLJ: main: # @main
-;SJLJ-NEXT: .Lfunc_begin0:
-;SJLJ-NEXT: # %bb.0: # %entry
-;SJLJ-NEXT: endbr64
-;SJLJ-NEXT: pushq %rbp
-;SJLJ: callq _Unwind_SjLj_Register
-;SJLJ-NEXT: .Ltmp0:
-;SJLJ-NEXT: callq _Z3foov
-;SJLJ-NEXT: .Ltmp1:
-;SJLJ-NEXT: # %bb.1: # %invoke.cont
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: .LBB0_7: # %return
-;SJLJ: callq _Unwind_SjLj_Unregister
-;SJLJ: retq
-;SJLJ-NEXT: .LBB0_9:
-;SJLJ-NEXT: endbr64
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: cmpl
-;SJLJ-NEXT: jb .LBB0_10
-;SJLJ-NEXT: # %bb.11:
-;SJLJ-NEXT: ud2
-;SJLJ-NEXT: .LBB0_10:
-;SJLJ-NEXT: leaq .LJTI0_0(%rip), %rcx
-;SJLJ-NEXT: jmpq *(%rcx,%rax,8)
-;SJLJ-NEXT: .LBB0_2: # %lpad
-;SJLJ-NEXT: .Ltmp2:
-;SJLJ-NEXT: endbr64
-;SJLJ: jne .LBB0_4
-;SJLJ-NEXT: # %bb.3: # %catch3
-;SJLJ: callq __cxa_begin_catch
-;SJLJ: jmp .LBB0_6
-;SJLJ-NEXT: .LBB0_4: # %catch.fallthrough
-;SJLJ-NEXT: cmpl
-;SJLJ-NEXT: jne .LBB0_8
-;SJLJ-NEXT: # %bb.5: # %catch
-;SJLJ: callq __cxa_begin_catch
-;SJLJ: cmpb
-;SJLJ-NEXT: .LBB0_6: # %return
-;SJLJ: callq __cxa_end_catch
-;SJLJ-NEXT: jmp .LBB0_7
-;SJLJ-NEXT: .LBB0_8: # %eh.resume
-;SJLJ-NEXT: movl
-;SJLJ-NEXT: .Lfunc_end0:
-;SJLJ: .LJTI0_0:
-;SJLJ-NEXT: .quad .LBB0_2
-
@_ZTIi = external dso_local constant ptr
@_ZTIc = external dso_local constant ptr
; Function Attrs: noinline norecurse optnone uwtable
define dso_local i32 @main() #0 personality ptr @__gxx_personality_sj0 {
+; NUM-LABEL: main:
+; NUM: # %bb.0: # %entry
+; NUM-NEXT: endbr64
+; NUM-NEXT: pushq %rbp
+; NUM-NEXT: movq %rsp, %rbp
+; NUM-NEXT: pushq %r15
+; NUM-NEXT: pushq %r14
+; NUM-NEXT: pushq %r13
+; NUM-NEXT: pushq %r12
+; NUM-NEXT: pushq %rbx
+; NUM-NEXT: subq $120, %rsp
+; NUM-NEXT: movl $0, -44(%rbp)
+; NUM-NEXT: movq $__gxx_personality_sj0, -120(%rbp)
+; NUM-NEXT: movq $GCC_except_table0, -112(%rbp)
+; NUM-NEXT: movq %rbp, -104(%rbp)
+; NUM-NEXT: movq %rsp, -88(%rbp)
+; NUM-NEXT: movq $.LBB0_9, -96(%rbp)
+; NUM-NEXT: movl $1, -144(%rbp)
+; NUM-NEXT: leaq -152(%rbp), %rdi
+; NUM-NEXT: callq _Unwind_SjLj_Register@PLT
+; NUM-NEXT: .Ltmp0:
+; NUM-NEXT: callq _Z3foov
+; NUM-NEXT: .Ltmp1:
+; NUM-NEXT: # %bb.1: # %invoke.cont
+; NUM-NEXT: movl $1, -44(%rbp)
+; NUM-NEXT: .LBB0_7: # %return
+; NUM-NEXT: movl -44(%rbp), %ebx
+; NUM-NEXT: leaq -152(%rbp), %rdi
+; NUM-NEXT: callq _Unwind_SjLj_Unregister@PLT
+; NUM-NEXT: movl %ebx, %eax
+; NUM-NEXT: addq $120, %rsp
+; NUM-NEXT: popq %rbx
+; NUM-NEXT: popq %r12
+; NUM-NEXT: popq %r13
+; NUM-NEXT: popq %r14
+; NUM-NEXT: popq %r15
+; NUM-NEXT: popq %rbp
+; NUM-NEXT: retq
+; NUM-NEXT: .LBB0_9:
+; NUM-NEXT: endbr64
+; NUM-NEXT: movl -144(%rbp), %eax
+; NUM-NEXT: cmpl $1, %eax
+; NUM-NEXT: jb .LBB0_10
+; NUM-NEXT: # %bb.11:
+; NUM-NEXT: ud2
+; NUM-NEXT: .LBB0_10:
+; NUM-NEXT: leaq .LJTI0_0(%rip), %rcx
+; NUM-NEXT: jmpq *(%rcx,%rax,8)
+; NUM-NEXT: .LBB0_2: # %lpad
+; NUM-NEXT: .Ltmp2:
+; NUM-NEXT: endbr64
+; NUM-NEXT: movl -140(%rbp), %ecx
+; NUM-NEXT: movl -136(%rbp), %eax
+; NUM-NEXT: movq %rcx, -56(%rbp)
+; NUM-NEXT: movl %eax, -64(%rbp)
+; NUM-NEXT: cmpl $2, %eax
+; NUM-NEXT: jne .LBB0_4
+; NUM-NEXT: # %bb.3: # %catch3
+; NUM-NEXT: movq -56(%rbp), %rdi
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_begin_catch
+; NUM-NEXT: movl (%rax), %eax
+; NUM-NEXT: movl %eax, -60(%rbp)
+; NUM-NEXT: xorl %ecx, %ecx
+; NUM-NEXT: cmpl $5, %eax
+; NUM-NEXT: jmp .LBB0_6
+; NUM-NEXT: .LBB0_4: # %catch.fallthrough
+; NUM-NEXT: cmpl $1, %eax
+; NUM-NEXT: jne .LBB0_8
+; NUM-NEXT: # %bb.5: # %catch
+; NUM-NEXT: movq -56(%rbp), %rdi
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_begin_catch
+; NUM-NEXT: movzbl (%rax), %eax
+; NUM-NEXT: movb %al, -45(%rbp)
+; NUM-NEXT: xorl %ecx, %ecx
+; NUM-NEXT: cmpb $3, %al
+; NUM-NEXT: .LBB0_6: # %return
+; NUM-NEXT: setne %cl
+; NUM-NEXT: movl %ecx, -44(%rbp)
+; NUM-NEXT: movl $-1, -144(%rbp)
+; NUM-NEXT: callq __cxa_end_catch
+; NUM-NEXT: jmp .LBB0_7
+; NUM-NEXT: .LBB0_8: # %eh.resume
+; NUM-NEXT: movl $-1, -144(%rbp)
+;
+; SJLJ-LABEL: main:
+; SJLJ: # %bb.0: # %entry
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: pushq %rbp
+; SJLJ-NEXT: movq %rsp, %rbp
+; SJLJ-NEXT: pushq %r15
+; SJLJ-NEXT: pushq %r14
+; SJLJ-NEXT: pushq %r13
+; SJLJ-NEXT: pushq %r12
+; SJLJ-NEXT: pushq %rbx
+; SJLJ-NEXT: subq $120, %rsp
+; SJLJ-NEXT: movl $0, -44(%rbp)
+; SJLJ-NEXT: movq $__gxx_personality_sj0, -120(%rbp)
+; SJLJ-NEXT: movq $GCC_except_table0, -112(%rbp)
+; SJLJ-NEXT: movq %rbp, -104(%rbp)
+; SJLJ-NEXT: movq %rsp, -88(%rbp)
+; SJLJ-NEXT: movq $.LBB0_9, -96(%rbp)
+; SJLJ-NEXT: movl $1, -144(%rbp)
+; SJLJ-NEXT: leaq -152(%rbp), %rdi
+; SJLJ-NEXT: callq _Unwind_SjLj_Register@PLT
+; SJLJ-NEXT: .Ltmp0:
+; SJLJ-NEXT: callq _Z3foov
+; SJLJ-NEXT: .Ltmp1:
+; SJLJ-NEXT: # %bb.1: # %invoke.cont
+; SJLJ-NEXT: movl $1, -44(%rbp)
+; SJLJ-NEXT: .LBB0_7: # %return
+; SJLJ-NEXT: movl -44(%rbp), %ebx
+; SJLJ-NEXT: leaq -152(%rbp), %rdi
+; SJLJ-NEXT: callq _Unwind_SjLj_Unregister@PLT
+; SJLJ-NEXT: movl %ebx, %eax
+; SJLJ-NEXT: addq $120, %rsp
+; SJLJ-NEXT: popq %rbx
+; SJLJ-NEXT: popq %r12
+; SJLJ-NEXT: popq %r13
+; SJLJ-NEXT: popq %r14
+; SJLJ-NEXT: popq %r15
+; SJLJ-NEXT: popq %rbp
+; SJLJ-NEXT: retq
+; SJLJ-NEXT: .LBB0_9:
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: movl -144(%rbp), %eax
+; SJLJ-NEXT: cmpl $1, %eax
+; SJLJ-NEXT: jb .LBB0_10
+; SJLJ-NEXT: # %bb.11:
+; SJLJ-NEXT: ud2
+; SJLJ-NEXT: .LBB0_10:
+; SJLJ-NEXT: leaq .LJTI0_0(%rip), %rcx
+; SJLJ-NEXT: jmpq *(%rcx,%rax,8)
+; SJLJ-NEXT: .LBB0_2: # %lpad
+; SJLJ-NEXT: .Ltmp2:
+; SJLJ-NEXT: endbr64
+; SJLJ-NEXT: movl -140(%rbp), %ecx
+; SJLJ-NEXT: movl -136(%rbp), %eax
+; SJLJ-NEXT: movq %rcx, -56(%rbp)
+; SJLJ-NEXT: movl %eax, -64(%rbp)
+; SJLJ-NEXT: cmpl $2, %eax
+; SJLJ-NEXT: jne .LBB0_4
+; SJLJ-NEXT: # %bb.3: # %catch3
+; SJLJ-NEXT: movq -56(%rbp), %rdi
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_begin_catch
+; SJLJ-NEXT: movl (%rax), %eax
+; SJLJ-NEXT: movl %eax, -60(%rbp)
+; SJLJ-NEXT: xorl %ecx, %ecx
+; SJLJ-NEXT: cmpl $5, %eax
+; SJLJ-NEXT: jmp .LBB0_6
+; SJLJ-NEXT: .LBB0_4: # %catch.fallthrough
+; SJLJ-NEXT: cmpl $1, %eax
+; SJLJ-NEXT: jne .LBB0_8
+; SJLJ-NEXT: # %bb.5: # %catch
+; SJLJ-NEXT: movq -56(%rbp), %rdi
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_begin_catch
+; SJLJ-NEXT: movzbl (%rax), %eax
+; SJLJ-NEXT: movb %al, -45(%rbp)
+; SJLJ-NEXT: xorl %ecx, %ecx
+; SJLJ-NEXT: cmpb $3, %al
+; SJLJ-NEXT: .LBB0_6: # %return
+; SJLJ-NEXT: setne %cl
+; SJLJ-NEXT: movl %ecx, -44(%rbp)
+; SJLJ-NEXT: movl $-1, -144(%rbp)
+; SJLJ-NEXT: callq __cxa_end_catch
+; SJLJ-NEXT: jmp .LBB0_7
+; SJLJ-NEXT: .LBB0_8: # %eh.resume
+; SJLJ-NEXT: movl $-1, -144(%rbp)
entry:
%retval = alloca i32, align 4
%exn.slot = alloca ptr
diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll
index 8fa7ce0..eb5d172 100644
--- a/llvm/test/CodeGen/X86/load-local-v3i129.ll
+++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll
@@ -12,7 +12,7 @@ define void @_start() nounwind {
; FAST-SHLD-NEXT: shrq $2, %rcx
; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx
; FAST-SHLD-NEXT: andq $-4, %rax
-; FAST-SHLD-NEXT: orq $1, %rax
+; FAST-SHLD-NEXT: incq %rax
; FAST-SHLD-NEXT: movq %rax, -40(%rsp)
; FAST-SHLD-NEXT: movq %rcx, -32(%rsp)
; FAST-SHLD-NEXT: orq $-2, -56(%rsp)
@@ -23,7 +23,7 @@ define void @_start() nounwind {
; SLOW-SHLD: # %bb.0: # %Entry
; SLOW-SHLD-NEXT: movq -40(%rsp), %rax
; SLOW-SHLD-NEXT: andq $-4, %rax
-; SLOW-SHLD-NEXT: orq $1, %rax
+; SLOW-SHLD-NEXT: incq %rax
; SLOW-SHLD-NEXT: movq %rax, -40(%rsp)
; SLOW-SHLD-NEXT: orq $-2, -56(%rsp)
; SLOW-SHLD-NEXT: movq $-1, -48(%rsp)
diff --git a/llvm/test/CodeGen/X86/pr23664.ll b/llvm/test/CodeGen/X86/pr23664.ll
index 453e5db..8179602 100644
--- a/llvm/test/CodeGen/X86/pr23664.ll
+++ b/llvm/test/CodeGen/X86/pr23664.ll
@@ -6,7 +6,7 @@ define i2 @f(i32 %arg) {
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
; CHECK-NEXT: leal (%rdi,%rdi), %eax
-; CHECK-NEXT: orb $1, %al
+; CHECK-NEXT: incb %al
; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%trunc = trunc i32 %arg to i1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
new file mode 100644
index 0000000..32c7e82
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-trunc-nowrap.ll
@@ -0,0 +1,2213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2-SSSE3,SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+
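+; Note: each nsw/nuw pair below is expected to lower identically to a plain
+; trunc for now, so the paired tests should produce matching assembly.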
+define <8 x i32> @trunc8i64_8i32_nsw(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_nsw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_nsw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_nsw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i32>
+ ret <8 x i32> %0
+}
+
+define <8 x i32> @trunc8i64_8i32_nuw(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc8i64_8i32_nuw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_nuw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_nuw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i32>
+ ret <8 x i32> %0
+}
+
+define <8 x i16> @trunc8i64_8i16_nsw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i16_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i16_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define <8 x i16> @trunc8i64_8i16_nuw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i16_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i16_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define void @trunc8i64_8i8_nsw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i8_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i64> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc8i64_8i8_nuw(<8 x i64> %a) {
+; SSE2-SSSE3-LABEL: trunc8i64_8i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = [255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i8_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i64> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define <8 x i16> @trunc8i32_8i16_nsw(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i32> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define <8 x i16> @trunc8i32_8i16_nuw(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i32> %a to <8 x i16>
+ ret <8 x i16> %0
+}
+
+define void @trunc8i32_8i8_nsw(<8 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc8i32_8i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i32> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc8i32_8i8_nuw(<8 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc8i32_8i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movq %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
+; SSE41-NEXT: movq %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i32> %a to <8 x i8>
+ store <8 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i16_nsw(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i32> %a to <16 x i16>
+ store <16 x i16> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i16_nuw(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i32> %a to <16 x i16>
+ store <16 x i16> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i8_nsw(<16 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc16i32_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i32> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i8_nuw(<16 x i32> %a) {
+; SSE2-SSSE3-LABEL: trunc16i32_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = [255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i32> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i16_16i8_nsw(<16 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc16i16_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i16_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i16> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc16i16_16i8_nuw(<16 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc16i16_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i16_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i16> %a to <16 x i8>
+ store <16 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc32i16_32i8_nsw(<32 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc32i16_32i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc32i16_32i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc32i16_32i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc32i16_32i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc32i16_32i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc32i16_32i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc32i16_32i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc32i16_32i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <32 x i16> %a to <32 x i8>
+ store <32 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define void @trunc32i16_32i8_nuw(<32 x i16> %a) {
+; SSE2-SSSE3-LABEL: trunc32i16_32i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSE2-SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc32i16_32i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc32i16_32i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc32i16_32i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc32i16_32i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc32i16_32i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc32i16_32i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc32i16_32i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <32 x i16> %a to <32 x i8>
+ store <32 x i8> %0, ptr undef, align 4
+ ret void
+}
+
+define <8 x i32> @trunc2x4i64_8i32_nsw(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: trunc2x4i64_8i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i32_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32_nsw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc2x4i64_8i32_nsw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i64> %a to <4 x i32>
+ %1 = trunc nsw <4 x i64> %b to <4 x i32>
+ %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @trunc2x4i64_8i32_nuw(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: trunc2x4i64_8i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i32_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32_nuw:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512-LABEL: trunc2x4i64_8i32_nuw:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i64> %a to <4 x i32>
+ %1 = trunc nuw <4 x i64> %b to <4 x i32>
+ %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @trunc2x4i64_8i16_nsw(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: trunc2x4i64_8i16_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i64_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i16_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x4i64_8i16_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i64> %a to <4 x i16>
+ %1 = trunc nsw <4 x i64> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc2x4i64_8i16_nuw(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-SSSE3-LABEL: trunc2x4i64_8i16_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i64_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x4i64_8i16_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x4i64_8i16_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i64_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i64> %a to <4 x i16>
+ %1 = trunc nuw <4 x i64> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @trunc2x2i64_4i32_nsw(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: trunc2x2i64_4i32_nsw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc2x2i64_4i32_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <2 x i64> %a to <2 x i32>
+ %1 = trunc nsw <2 x i64> %b to <2 x i32>
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @trunc2x2i64_4i32_nuw(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: trunc2x2i64_4i32_nuw:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: trunc2x2i64_4i32_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <2 x i64> %a to <2 x i32>
+ %1 = trunc nuw <2 x i64> %b to <2 x i32>
+ %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc2x4i32_8i16_nsw(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32_8i16_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc2x4i32_8i16_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i32_8i16_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc2x4i32_8i16_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <4 x i32> %a to <4 x i16>
+ %1 = trunc nsw <4 x i32> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc2x4i32_8i16_nuw(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: trunc2x4i32_8i16_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc2x4i32_8i16_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x4i32_8i16_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc2x4i32_8i16_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x4i32_8i16_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <4 x i32> %a to <4 x i16>
+ %1 = trunc nuw <4 x i32> %b to <4 x i16>
+ %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %2
+}
+
+define <32 x i8> @trunc2x16i16_32i8_nsw(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x16i16_32i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: packuswb %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x16i16_32i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x16i16_32i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x16i16_32i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <16 x i16> %a to <16 x i8>
+ %1 = trunc nsw <16 x i16> %b to <16 x i8>
+ %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @trunc2x16i16_32i8_nuw(<16 x i16> %a, <16 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x16i16_32i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x16i16_32i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm4
+; SSE41-NEXT: packuswb %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x16i16_32i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x16i16_32i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x16i16_32i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <16 x i16> %a to <16 x i8>
+ %1 = trunc nuw <16 x i16> %b to <16 x i8>
+ %2 = shufflevector <16 x i8> %0, <16 x i8> %1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %2
+}
+
+define <16 x i8> @trunc2x8i16_16i8_nsw(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nsw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x8i16_16i8_nsw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8_nsw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x8i16_16i8_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i16> %a to <8 x i8>
+ %1 = trunc nsw <8 x i16> %b to <8 x i8>
+ %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc2x8i16_16i8_nuw(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-SSSE3-LABEL: trunc2x8i16_16i8_nuw:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc2x8i16_16i8_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc2x8i16_16i8_nuw:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc2x8i16_16i8_nuw:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x8i16_16i8_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i16> %a to <8 x i8>
+ %1 = trunc nuw <8 x i16> %b to <8 x i8>
+ %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %2
+}
+
+define i64 @trunc8i16_i64_nsw(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16_i64_nsw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i16_i64_nsw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq %xmm0, %rax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i16_i64_nsw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc8i16_i64_nsw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i16_i64_nsw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i16_i64_nsw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64_nsw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i16_i64_nsw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nsw <8 x i16> %inval to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to i64
+ ret i64 %1
+}
+
+define i64 @trunc8i16_i64_nuw(<8 x i16> %inval) {
+; SSE2-LABEL: trunc8i16_i64_nuw:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i16_i64_nuw:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq %xmm0, %rax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i16_i64_nuw:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc8i16_i64_nuw:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i16_i64_nuw:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i16_i64_nuw:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64_nuw:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i16_i64_nuw:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = trunc nuw <8 x i16> %inval to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to i64
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 691ca40..f7a27a5 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -65,6 +65,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -74,6 +75,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movb %al, (%rdx)
@@ -81,14 +83,15 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2: # %bb.0:
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NEXT: movzwl (%eax), %eax
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NEXT: movb %al, (%edx)
+; X86-NO-BMI2-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NEXT: movb %dl, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
@@ -97,6 +100,7 @@ define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movb %cl, (%eax)
@@ -119,6 +123,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-NO-BMI2-NEXT: movzwl %ax, %eax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrl %cl, %eax
@@ -128,6 +133,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
; X64-BMI2-NEXT: movzwl (%rdi), %eax
+; X64-BMI2-NEXT: movzwl %ax, %eax
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: shrxl %esi, %eax, %eax
; X64-BMI2-NEXT: movw %ax, (%rdx)
@@ -139,6 +145,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movzwl (%edx), %edx
+; X86-NO-BMI2-NEXT: movzwl %dx, %edx
; X86-NO-BMI2-NEXT: shll $3, %ecx
; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT: shrl %cl, %edx
@@ -151,6 +158,7 @@ define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT: movzwl (%edx), %edx
+; X86-BMI2-NEXT: movzwl %dx, %edx
; X86-BMI2-NEXT: shll $3, %ecx
; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT: movw %cx, (%eax)
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
index 8d96ab0..f75042b 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out-no-ps.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=0 | FileCheck %s --check-prefix=FULL
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S \
-; RUN: -hwasan-selective-instrumentation=1 | FileCheck %s --check-prefix=SELSAN
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S | FileCheck %s --check-prefix=FULL
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=SELSAN
; FULL: @not_sanitized
; FULL-NEXT: %x = alloca i8, i64 4
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
index 28e43a9..ab3f56d 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/pgo-opt-out.ll
@@ -1,23 +1,19 @@
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: | FileCheck %s --check-prefix=DEFAULT
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT_RATE
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM_RATE_0
-; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-selective-instrumentation=1 \
-; RUN: -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM_RATE_1
-
-; DEFAULT: @sanitized
-; DEFAULT-NEXT: %x = alloca i8, i64 4
-
-; HOT_RATE: @sanitized
-; HOT_RATE-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_0: @sanitized
-; RANDOM_RATE_0-NEXT: @__hwasan_tls
-
-; RANDOM_RATE_1: @sanitized
-; RANDOM_RATE_1-NEXT: %x = alloca i8, i64 4
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=700000 | FileCheck %s --check-prefix=HOT70
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-percentile-cutoff-hot=990000 | FileCheck %s --check-prefix=HOT99
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=0.0 | FileCheck %s --check-prefix=RANDOM0
+; RUN: opt < %s -passes='require<profile-summary>,hwasan' -S -hwasan-random-skip-rate=1.0 | FileCheck %s --check-prefix=RANDOM1
+
+; HOT70: @sanitized
+; HOT70-NEXT: @__hwasan_tls
+
+; HOT99: @sanitized
+; HOT99-NEXT: %x = alloca i8, i64 4
+
+; RANDOM0: @sanitized
+; RANDOM0-NEXT: @__hwasan_tls
+
+; RANDOM1: @sanitized
+; RANDOM1-NEXT: %x = alloca i8, i64 4
declare void @use(ptr)
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
index 056221f..58b7847 100644
--- a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
@@ -23,3 +23,24 @@ v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
// GFX1150: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f32_e64_dpp v5, v1, s2 row_mirror
+// GFX1150: encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_min3_f16 v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, v2 row_mirror
+// GFX1150: encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, s2 row_mirror
+// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_cmp_le_f32 vcc_lo, v1, s2 quad_perm:[1,1,1,1]
+// GFX1150: encoding: [0x6a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x55,0x00,0xff]
+
+v_cmpx_neq_f16 v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f16 v1, 2.0 quad_perm:[1,1,1,1]
+// GFX1150: encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x55,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index da1989e..3ec3162 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -51,13 +51,13 @@ v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -135,7 +135,7 @@ v_fmac_f16_e64_dpp v5, s2, v3 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_fmac_f16_e64_dpp v5, v2, 1.0 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_fmac_f32_e64_dpp v5, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -144,7 +144,7 @@ v_fmac_f32_e64_dpp v5, 0x1234, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_fmac_f32_e64_dpp v5, v2, 1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: src1 immediate operand invalid for instruction
v_fmac_f32_e64_dpp v5, -1.0, v3 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
index bb911c6..7393de2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s
@@ -6,22 +6,49 @@
//
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1150: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// GFX12: encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmp_le_f32 vcc_lo, v1, v2 row_mirror
+// GFX12: encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_mirror
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_half_mirror
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x41,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_shl:15
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_shr:1
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x11,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp s5, v1, s99 row_ror:1
+// GFX12: encoding: [0x05,0x00,0x12,0xd4,0xfa,0xc6,0x00,0x00,0x01,0x21,0x01,0xff]
+
+v_cmp_eq_f32_e64_dpp vcc_hi, |v1|, -s99 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: encoding: [0x6b,0x01,0x12,0xd4,0xfa,0xc6,0x00,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmp_eq_f32_e64_dpp ttmp15, -v1, |s99| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: encoding: [0x7b,0x02,0x12,0xd4,0xfa,0xc6,0x00,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_gt_f32_e64_dpp v255, 4.0 dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: encoding: [0x7e,0x00,0x94,0xd4,0xe9,0xec,0x01,0x00,0xff,0x00,0x00,0x00]
//
// Elements of CPol operand can be given in any order
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index 88bdb7e..d0e309a 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -6,6 +6,12 @@
v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -57,6 +63,10 @@ v_add_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -113,6 +123,10 @@ v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x00,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -155,6 +169,12 @@ v_add_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_add_lshl_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add_lshl_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -323,6 +343,12 @@ v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_alignbit_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_alignbit_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -365,6 +391,12 @@ v_alignbit_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_alignbyte_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_alignbyte_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -449,6 +481,12 @@ v_and_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_and_or_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_and_or_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -575,6 +613,12 @@ v_bcnt_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0
v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfe_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfe_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -617,6 +661,12 @@ v_bfe_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfe_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfe_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -659,6 +709,12 @@ v_bfe_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_bfi_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_bfi_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -752,6 +808,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror
// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror
+// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror
+// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror
// W32: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -808,6 +872,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror
// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror
+// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror
+// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1
// W64: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -850,6 +922,12 @@ v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 ban
v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubeid_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubeid_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -892,6 +970,12 @@ v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubema_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubema_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -934,6 +1018,12 @@ v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubesc_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubesc_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -976,6 +1066,12 @@ v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cubetc_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cubetc_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1378,6 +1474,12 @@ v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_cvt_pk_u8_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_cvt_pk_u8_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1588,6 +1690,12 @@ v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 b
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_div_fixup_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_div_fixup_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1630,6 +1738,12 @@ v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 ro
v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1672,6 +1786,12 @@ v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask
v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_fma_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_fma_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1756,6 +1876,9 @@ v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 row_xmask:15 row_mask:0x3 ba
v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lerp_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1798,6 +1921,12 @@ v_lerp_u8_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lshl_add_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_lshl_add_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1840,6 +1969,12 @@ v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_lshl_or_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_lshl_or_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -1966,6 +2101,12 @@ v_lshrrev_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2008,6 +2149,12 @@ v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i32_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i32_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2050,6 +2197,12 @@ v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_i32_i24_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_i32_i24_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2092,6 +2245,12 @@ v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2134,6 +2293,12 @@ v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u32_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u32_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2176,6 +2341,12 @@ v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mad_u32_u24_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mad_u32_u24_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2218,6 +2389,12 @@ v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3
v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2260,6 +2437,12 @@ v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2302,6 +2485,12 @@ v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2344,6 +2533,12 @@ v_max3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2386,6 +2581,12 @@ v_max3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2428,6 +2629,12 @@ v_max3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_max3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_max3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2554,6 +2761,12 @@ v_max_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2596,6 +2809,12 @@ v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2638,6 +2857,12 @@ v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2680,6 +2905,12 @@ v_maxmin_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maxmin_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maxmin_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2806,6 +3037,12 @@ v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:
v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2848,6 +3085,12 @@ v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2890,6 +3133,12 @@ v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2932,6 +3181,12 @@ v_med3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -2974,6 +3229,12 @@ v_med3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3016,6 +3277,12 @@ v_med3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_med3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_med3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3058,6 +3325,12 @@ v_med3_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3100,6 +3373,12 @@ v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row
v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3142,6 +3421,12 @@ v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:
v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_i16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_i16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3184,6 +3469,12 @@ v_min3_i16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3226,6 +3517,12 @@ v_min3_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3268,6 +3565,12 @@ v_min3_u16_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_min3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_min3_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3394,6 +3697,12 @@ v_min_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3436,6 +3745,12 @@ v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_num_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_num_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3478,6 +3793,12 @@ v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmas
v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_i32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_i32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3520,6 +3841,12 @@ v_minmax_i32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minmax_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minmax_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3562,6 +3889,9 @@ v_minmax_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_ma
v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_msad_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3646,6 +3976,12 @@ v_mul_lo_u16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bo
v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_mullit_f32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_mullit_f32_e64_dpp v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3688,6 +4024,12 @@ v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_or3_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_or3_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3814,6 +4156,12 @@ v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mas
v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_perm_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_perm_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3856,6 +4204,9 @@ v_perm_b32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_hi_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3898,6 +4249,12 @@ v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 ba
v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u16_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_sad_u16_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3940,6 +4297,12 @@ v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_sad_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -3982,6 +4345,9 @@ v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp row_xmask:15 row_mask:0x3 bank
v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_sad_u8_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4033,6 +4399,10 @@ v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4089,6 +4459,10 @@ v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x01,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4266,6 +4640,10 @@ v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_mirror
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s6, v1, s2 row_mirror
+// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s6, v1, v2 row_half_mirror
// W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4322,6 +4700,10 @@ v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_half_mirror
// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s[12:13], v1, s2 row_half_mirror
+// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x00,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s[12:13], v1, v2 row_shl:1
// W64: [0x05,0x0c,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4364,6 +4746,12 @@ v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 b
v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_xad_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_xad_u32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4406,6 +4794,12 @@ v_xad_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:
v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_xor3_b32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_xor3_b32_e64_dpp v5, v1, 10, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x14,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -4770,7 +5164,7 @@ v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x04,0x00]
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX12: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
@@ -4791,7 +5185,7 @@ v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_ma
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0c,0x04,0x01,0xe4,0x00,0x00]
v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
// GFX12: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
@@ -4973,6 +5367,12 @@ v_maximum_f16 v255, -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bou
v_minimum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimum3_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimum3_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5015,6 +5415,12 @@ v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_m
v_maximum3_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximum3_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximum3_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximum3_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5057,6 +5463,12 @@ v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_m
v_minimum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimum3_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimum3_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5099,6 +5511,12 @@ v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_mask:0x
v_maximum3_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximum3_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximum3_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximum3_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5180,6 +5598,12 @@ v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimummaximum_f32 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimummaximum_f32 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimummaximum_f32 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5222,6 +5646,12 @@ v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15
v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_maximumminimum_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_maximumminimum_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_maximumminimum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
@@ -5264,6 +5694,12 @@ v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp row_xmask:15 row_m
v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
+v_minimummaximum_f16 v5, v1, s2, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_minimummaximum_f16 v5, v1, 2.0, v3 quad_perm:[3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0xe8,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
v_minimummaximum_f16 v5, v1, v2, v3 quad_perm:[0,1,2,3]
// GFX12: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index 0e84765..25b13ac 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -6,6 +6,12 @@
v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_add3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -47,6 +53,10 @@ v_add_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s105, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x69,0x00,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, vcc_lo, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x6a,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -67,6 +77,10 @@ v_add_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add_co_u32_e64_dpp v5, s[104:105], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x68,0x00,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_add_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x6a,0x00,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -81,6 +95,12 @@ v_add_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_add_lshl_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_add_lshl_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -144,6 +164,12 @@ v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_alignbit_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_alignbit_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -177,6 +203,12 @@ v_alignbit_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_alignbyte_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_alignbyte_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -219,6 +251,12 @@ v_and_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_and_or_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_and_or_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -273,6 +311,12 @@ v_bcnt_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfe_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfe_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -309,6 +353,12 @@ v_bfe_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfe_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfe_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -345,6 +395,12 @@ v_bfe_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_bfi_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_bfi_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -391,6 +447,14 @@ v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -423,12 +487,22 @@ v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
// W64: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubeid_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubeid_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -465,6 +539,12 @@ v_cubeid_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubema_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubema_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -501,6 +581,12 @@ v_cubema_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubesc_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubesc_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -537,6 +623,12 @@ v_cubesc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cubetc_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cubetc_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -687,6 +779,12 @@ v_cvt_pk_u16_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_cvt_pk_u8_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_cvt_pk_u8_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -771,6 +869,12 @@ v_cvt_pk_norm_u16_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_div_fixup_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_div_fixup_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -807,6 +911,12 @@ v_div_fixup_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0
v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -843,6 +953,12 @@ v_fma_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0
v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_fma_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_fma_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -891,6 +1007,9 @@ v_ldexp_f32_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lerp_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -927,6 +1046,12 @@ v_lerp_u8_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lshl_add_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_lshl_add_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -963,6 +1088,12 @@ v_lshl_add_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_lshl_or_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_lshl_or_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1017,6 +1148,12 @@ v_lshrrev_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1050,6 +1187,12 @@ v_mad_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i32_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i32_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i32_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1086,6 +1229,12 @@ v_mad_i32_i16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_i32_i24_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_i32_i24_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1122,6 +1271,12 @@ v_mad_i32_i24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1155,6 +1310,12 @@ v_mad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u32_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u32_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1191,6 +1352,12 @@ v_mad_u32_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mad_u32_u24_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mad_u32_u24_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1227,6 +1394,12 @@ v_mad_u32_u24_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:
v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1263,6 +1436,12 @@ v_max3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1299,6 +1478,12 @@ v_max3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1332,6 +1517,12 @@ v_max3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1368,6 +1559,12 @@ v_max3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1401,6 +1598,12 @@ v_max3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_max3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_max3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1455,6 +1658,12 @@ v_max_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1491,6 +1700,12 @@ v_maxmin_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1527,6 +1742,12 @@ v_maxmin_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1563,6 +1784,12 @@ v_maxmin_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maxmin_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maxmin_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1617,6 +1844,12 @@ v_mbcnt_lo_u32_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1653,6 +1886,12 @@ v_med3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1689,6 +1928,12 @@ v_med3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1722,6 +1967,12 @@ v_med3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1758,6 +2009,12 @@ v_med3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1791,6 +2048,12 @@ v_med3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_med3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_med3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1827,6 +2090,12 @@ v_med3_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1863,6 +2132,12 @@ v_min3_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,
v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1899,6 +2174,12 @@ v_min3_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,
v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_i16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_i16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1932,6 +2213,12 @@ v_min3_i16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -1968,6 +2255,12 @@ v_min3_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2001,6 +2294,12 @@ v_min3_u16_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_min3_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_min3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2055,6 +2354,12 @@ v_min_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f16_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2091,6 +2396,12 @@ v_minmax_num_f16_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_num_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_num_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2127,6 +2438,12 @@ v_minmax_num_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,
v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_i32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_i32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2163,6 +2480,12 @@ v_minmax_i32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minmax_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minmax_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2199,6 +2522,9 @@ v_minmax_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_msad_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2244,6 +2570,12 @@ v_mul_lo_u16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_mullit_f32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_mullit_f32_e64_dpp v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2280,6 +2612,12 @@ v_mullit_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_or3_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_or3_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2337,6 +2675,12 @@ v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_perm_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_perm_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2373,6 +2717,9 @@ v_perm_b32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_hi_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2409,6 +2756,12 @@ v_sad_hi_u8_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u16_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_sad_u16_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2445,6 +2798,12 @@ v_sad_u16_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_sad_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2481,6 +2840,9 @@ v_sad_u32_e64_dpp v255, v255, v255, src_scc clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_sad_u8_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2518,6 +2880,10 @@ v_sub_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s6, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x06,0x01,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2538,6 +2904,10 @@ v_sub_co_u32_e64_dpp v5, s[12:13], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x0c,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_sub_co_u32_e64_dpp v5, s[12:13], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x0c,0x01,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_sub_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x01,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2584,6 +2954,10 @@ v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s6, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x69,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2608,6 +2982,10 @@ v_subrev_co_u32_e64_dpp v5, s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x68,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_subrev_co_u32_e64_dpp v5, s[104:105], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x68,0x02,0xd7,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_subrev_co_u32_e64_dpp v5, vcc, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x6a,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2622,6 +3000,12 @@ v_subrev_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_xad_u32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_xad_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2658,6 +3042,12 @@ v_xad_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_xor3_b32_e64_dpp v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_xor3_b32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -2983,7 +3373,7 @@ v_dot2_f16_f16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_f16_f16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
// GFX12: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
@@ -3004,7 +3394,7 @@ v_dot2_bf16_bf16_e64_dpp v0, s1, v2, v3 dpp8:[0,1,2,3,4,4,4,4]
// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
v_dot2_bf16_bf16_e64_dpp v0, v1, s2, v3 dpp8:[0,1,2,3,4,4,4,4]
-// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX12: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x88,0x46,0x92]
v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4]
// GFX12: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92]
@@ -3066,6 +3456,12 @@ v_maximum_f16 v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] fi:0
v_minimum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimum3_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimum3_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3102,6 +3498,12 @@ v_minimum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,
v_maximum3_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximum3_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximum3_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximum3_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3138,6 +3540,12 @@ v_maximum3_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,
v_minimum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimum3_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimum3_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3174,6 +3582,12 @@ v_minimum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] f
v_maximum3_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximum3_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximum3_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximum3_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3210,6 +3624,12 @@ v_maximum3_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,0,0] f
v_maximumminimum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximumminimum_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximumminimum_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximumminimum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3246,6 +3666,12 @@ v_maximumminimum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_minimummaximum_f32 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimummaximum_f32 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimummaximum_f32 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimummaximum_f32 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3282,6 +3708,12 @@ v_minimummaximum_f32 v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,
v_maximumminimum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_maximumminimum_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_maximumminimum_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_maximumminimum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
@@ -3318,6 +3750,12 @@ v_maximumminimum_f16 v255, -|v255|, -|v255|, -|src_scc| clamp dpp8:[0,0,0,0,0,0,
v_minimummaximum_f16 v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
+v_minimummaximum_f16 v5, v1, s2, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0c,0x04,0x01,0x77,0x39,0x05]
+
+v_minimummaximum_f16 v5, v1, 2.0, v3 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0xe8,0x0d,0x04,0x01,0x77,0x39,0x05]
+
v_minimummaximum_f16 v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
index ab88ec8..2b7830c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp16.s
@@ -128,6 +128,12 @@ v_add_f16_e64_dpp v5, v1, v2 row_shl:1
v_add_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -170,6 +176,12 @@ v_add_f32_e64_dpp v5, v1, v2 row_shl:1
v_add_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -212,6 +224,12 @@ v_add_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_add_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_add_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_add_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_add_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -254,6 +272,12 @@ v_and_b32_e64_dpp v5, v1, v2 row_shl:1
v_and_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_and_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_and_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_and_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -296,6 +320,12 @@ v_ashrrev_i32_e64_dpp v5, v1, v2 row_shl:1
v_ashrrev_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_ashrrev_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_ashrrev_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_ashrrev_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -445,6 +475,12 @@ v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shl:1
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -487,6 +523,12 @@ v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shl:1
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -529,6 +571,12 @@ v_ldexp_f16_e64_dpp v5, v1, v2 row_shl:1
v_ldexp_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_ldexp_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_ldexp_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_ldexp_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -571,6 +619,12 @@ v_lshlrev_b32_e64_dpp v5, v1, v2 row_shl:1
v_lshlrev_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_lshlrev_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_lshlrev_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_lshlrev_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -613,6 +667,12 @@ v_lshrrev_b32_e64_dpp v5, v1, v2 row_shl:1
v_lshrrev_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_lshrrev_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_lshrrev_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_lshrrev_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -655,6 +715,12 @@ v_max_num_f16_e64_dpp v5, v1, v2 row_shl:1
v_max_num_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_num_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_num_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_num_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -697,6 +763,12 @@ v_max_num_f32_e64_dpp v5, v1, v2 row_shl:1
v_max_num_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_num_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_num_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_num_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -739,6 +811,12 @@ v_max_i32_e64_dpp v5, v1, v2 row_shl:1
v_max_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -781,6 +859,12 @@ v_max_u32_e64_dpp v5, v1, v2 row_shl:1
v_max_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_max_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_max_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_max_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -823,6 +907,12 @@ v_min_num_f16_e64_dpp v5, v1, v2 row_shl:1
v_min_num_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_num_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_num_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_num_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -865,6 +955,12 @@ v_min_num_f32_e64_dpp v5, v1, v2 row_shl:1
v_min_num_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_num_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_num_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_num_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -907,6 +1003,12 @@ v_min_i32_e64_dpp v5, v1, v2 row_shl:1
v_min_i32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_i32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_i32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_i32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -949,6 +1051,12 @@ v_min_u32_e64_dpp v5, v1, v2 row_shl:1
v_min_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_min_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_min_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_min_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -991,6 +1099,12 @@ v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_dx9_zero_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_dx9_zero_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1033,6 +1147,12 @@ v_mul_f16_e64_dpp v5, v1, v2 row_shl:1
v_mul_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1075,6 +1195,12 @@ v_mul_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1117,6 +1243,12 @@ v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shl:1
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_hi_i32_i24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_hi_i32_i24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1159,6 +1291,12 @@ v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shl:1
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_hi_u32_u24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_hi_u32_u24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1201,6 +1339,12 @@ v_mul_i32_i24_e64_dpp v5, v1, v2 row_shl:1
v_mul_i32_i24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_i32_i24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_i32_i24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_i32_i24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1243,6 +1387,12 @@ v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shl:1
v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_legacy_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_legacy_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_legacy_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1285,6 +1435,12 @@ v_mul_u32_u24_e64_dpp v5, v1, v2 row_shl:1
v_mul_u32_u24_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_mul_u32_u24_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_mul_u32_u24_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_mul_u32_u24_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1327,6 +1483,12 @@ v_or_b32_e64_dpp v5, v1, v2 row_shl:1
v_or_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_or_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_or_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_or_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1476,6 +1638,12 @@ v_sub_f16_e64_dpp v5, v1, v2 row_shl:1
v_sub_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1518,6 +1686,12 @@ v_sub_f32_e64_dpp v5, v1, v2 row_shl:1
v_sub_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1560,6 +1734,12 @@ v_sub_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_sub_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_sub_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_sub_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_sub_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1709,6 +1889,12 @@ v_subrev_f16_e64_dpp v5, v1, v2 row_shl:1
v_subrev_f16_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_f16_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_f16_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_f16_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1751,6 +1937,12 @@ v_subrev_f32_e64_dpp v5, v1, v2 row_shl:1
v_subrev_f32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_f32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_f32_e64_dpp v5, v1, 2.0 row_shl:15
+// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_f32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1793,6 +1985,12 @@ v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shl:1
v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_subrev_nc_u32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_subrev_nc_u32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_subrev_nc_u32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1835,6 +2033,12 @@ v_xnor_b32_e64_dpp v5, v1, v2 row_shl:1
v_xnor_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_xnor_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_xnor_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_xnor_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
@@ -1877,6 +2081,12 @@ v_xor_b32_e64_dpp v5, v1, v2 row_shl:1
v_xor_b32_e64_dpp v5, v1, v2 row_shl:15
// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_xor_b32_e64_dpp v5, v1, s2 row_shl:15
+// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x00,0x00,0x01,0x0f,0x01,0xff]
+
+v_xor_b32_e64_dpp v5, v1, 10 row_shl:15
+// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x14,0x01,0x00,0x01,0x0f,0x01,0xff]
+
v_xor_b32_e64_dpp v5, v1, v2 row_shr:1
// GFX12: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
index dc151d66..b18029d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop2_dpp8.s
@@ -45,6 +45,12 @@ v_add_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,0,0
v_add_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x32,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -57,6 +63,12 @@ v_add_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x03,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -69,6 +81,12 @@ v_add_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_add_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_add_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x25,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x25,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -78,6 +96,12 @@ v_add_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_and_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_and_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1b,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1b,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -87,6 +111,12 @@ v_and_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_ashrrev_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_ashrrev_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1a,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1a,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -97,14 +127,30 @@ v_cndmask_b32_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b32_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s105 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa4,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xac,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x01,0x01,0xd5,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -117,10 +163,22 @@ v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x18,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cndmask_b32_e64_dpp v5, v1, s2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa0,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cndmask_b32_e64_dpp v5, v1, 10, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x05,0x00,0x01,0xd5,0xe9,0x14,0xa1,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cndmask_b32_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x05,0x01,0x01,0xd5,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -135,6 +193,12 @@ v_cndmask_b32_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cvt_pk_rtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
@@ -147,6 +211,12 @@ v_cvt_pk_rtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0]
v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cvt_pkrtz_f16_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x2f,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cvt_pkrtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
@@ -159,9 +229,18 @@ v_cvt_pkrtz_f16_f32_e64_dpp v255, -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0]
v_ldexp_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_ldexp_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
v_ldexp_f16_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
+v_ldexp_f16_e64_dpp v5, v1, s2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x00,0x08,0x01,0x77,0x39,0x05]
+
+v_ldexp_f16_e64_dpp v5, v1, 2.0 mul:2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x3b,0xd5,0xe9,0xe8,0x01,0x08,0x01,0x77,0x39,0x05]
+
v_ldexp_f16_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x3b,0xd5,0xea,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
@@ -171,6 +250,12 @@ v_ldexp_f16_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_lshlrev_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_lshlrev_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x18,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x18,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -180,6 +265,12 @@ v_lshlrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_lshrrev_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_lshrrev_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x19,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x19,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -189,6 +280,12 @@ v_lshrrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_num_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_num_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x31,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -201,6 +298,12 @@ v_max_num_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_max_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_num_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_num_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x16,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -213,6 +316,12 @@ v_max_num_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x12,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x12,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -222,6 +331,12 @@ v_max_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_max_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_max_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x14,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x14,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -231,6 +346,12 @@ v_max_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_num_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_num_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x30,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -243,6 +364,12 @@ v_min_num_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_min_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_num_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_num_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x15,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -255,6 +382,12 @@ v_min_num_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0]
v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_i32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_i32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x11,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x11,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -264,6 +397,12 @@ v_min_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_min_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_min_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x13,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x13,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -273,6 +412,12 @@ v_min_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_dx9_zero_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_dx9_zero_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_dx9_zero_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -285,6 +430,12 @@ v_mul_dx9_zero_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,
v_mul_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x35,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -297,6 +448,12 @@ v_mul_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x08,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -309,6 +466,12 @@ v_mul_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_hi_i32_i24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_hi_i32_i24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0a,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0a,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -318,6 +481,12 @@ v_mul_hi_i32_i24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_hi_u32_u24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_hi_u32_u24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0c,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0c,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -327,6 +496,12 @@ v_mul_hi_u32_u24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_i32_i24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_i32_i24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x09,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x09,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -336,6 +511,12 @@ v_mul_i32_i24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_mul_legacy_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_legacy_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_legacy_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x07,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_legacy_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -348,6 +529,12 @@ v_mul_legacy_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,
v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_mul_u32_u24_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_mul_u32_u24_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x0b,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x0b,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -357,6 +544,12 @@ v_mul_u32_u24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_or_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_or_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1c,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1c,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -405,6 +598,12 @@ v_sub_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,0,0
v_sub_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x33,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -417,6 +616,12 @@ v_sub_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sub_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x04,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -429,6 +634,12 @@ v_sub_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x26,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x26,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -477,6 +688,12 @@ v_subrev_co_ci_u32_e64_dpp v255, null, v255, v255, null clamp dpp8:[0,0,0,0,0,0,
v_subrev_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_f16_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_f16_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x34,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -489,6 +706,12 @@ v_subrev_f16_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] f
v_subrev_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_f32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_f32_e64_dpp v5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x05,0xd5,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
@@ -501,6 +724,12 @@ v_subrev_f32_e64_dpp v255, -|v255|, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] f
v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_subrev_nc_u32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_subrev_nc_u32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x27,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x27,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -510,6 +739,12 @@ v_subrev_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_xnor_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_xnor_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1e,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1e,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -519,6 +754,12 @@ v_xnor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_xor_b32_e64_dpp v5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_xor_b32_e64_dpp v5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: [0x05,0x00,0x1d,0xd5,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x05,0x00,0x1d,0xd5,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
index b50b18e..037fa39 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp16.s
@@ -7,6 +7,14 @@ v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -59,6 +67,14 @@ v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -114,6 +130,14 @@ v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -166,6 +190,14 @@ v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -221,6 +253,14 @@ v_cmp_eq_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -273,6 +313,14 @@ v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -328,6 +376,14 @@ v_cmp_eq_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -380,6 +436,14 @@ v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -435,6 +499,14 @@ v_cmp_eq_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -487,6 +559,14 @@ v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -542,6 +622,14 @@ v_cmp_eq_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -594,6 +682,14 @@ v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -649,6 +745,14 @@ v_cmp_eq_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -701,6 +805,14 @@ v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -756,6 +868,14 @@ v_cmp_eq_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -808,6 +928,14 @@ v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -863,6 +991,14 @@ v_cmp_ge_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -915,6 +1051,14 @@ v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -970,6 +1114,14 @@ v_cmp_ge_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1022,6 +1174,14 @@ v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1077,6 +1237,14 @@ v_cmp_ge_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1129,6 +1297,14 @@ v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1184,6 +1360,14 @@ v_cmp_ge_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1236,6 +1420,14 @@ v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1291,6 +1483,14 @@ v_cmp_ge_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1343,6 +1543,14 @@ v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1398,6 +1606,14 @@ v_cmp_ge_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1450,6 +1666,14 @@ v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1505,6 +1729,14 @@ v_cmp_gt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1557,6 +1789,14 @@ v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1612,6 +1852,14 @@ v_cmp_gt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1664,6 +1912,14 @@ v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1719,6 +1975,14 @@ v_cmp_gt_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1771,6 +2035,14 @@ v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1826,6 +2098,14 @@ v_cmp_gt_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1878,6 +2158,14 @@ v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1933,6 +2221,14 @@ v_cmp_gt_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1985,6 +2281,14 @@ v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2040,6 +2344,14 @@ v_cmp_gt_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2092,6 +2404,14 @@ v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2147,6 +2467,14 @@ v_cmp_le_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2199,6 +2527,14 @@ v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2254,6 +2590,14 @@ v_cmp_le_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2306,6 +2650,14 @@ v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2361,6 +2713,14 @@ v_cmp_le_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2413,6 +2773,14 @@ v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2468,6 +2836,14 @@ v_cmp_le_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2520,6 +2896,14 @@ v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2575,6 +2959,14 @@ v_cmp_le_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2627,6 +3019,14 @@ v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2682,6 +3082,14 @@ v_cmp_le_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2734,6 +3142,14 @@ v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2789,6 +3205,14 @@ v_cmp_lg_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2841,6 +3265,14 @@ v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2896,6 +3328,14 @@ v_cmp_lg_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2948,6 +3388,14 @@ v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3003,6 +3451,14 @@ v_cmp_lt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3055,6 +3511,14 @@ v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3110,6 +3574,14 @@ v_cmp_lt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3162,6 +3634,14 @@ v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3217,6 +3697,14 @@ v_cmp_lt_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3269,6 +3757,14 @@ v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3324,6 +3820,14 @@ v_cmp_lt_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3376,6 +3880,14 @@ v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3431,6 +3943,14 @@ v_cmp_lt_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3483,6 +4003,14 @@ v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3538,6 +4066,14 @@ v_cmp_lt_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3590,6 +4126,14 @@ v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3645,6 +4189,14 @@ v_cmp_ne_i16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3697,6 +4249,14 @@ v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3752,6 +4312,14 @@ v_cmp_ne_i32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3804,6 +4372,14 @@ v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3859,6 +4435,14 @@ v_cmp_ne_u16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3911,6 +4495,14 @@ v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -3966,6 +4558,14 @@ v_cmp_ne_u32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s5, v1, 10 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4018,6 +4618,14 @@ v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s[10:11], v1, 10 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4073,6 +4681,14 @@ v_cmp_neq_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4125,6 +4741,14 @@ v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4180,6 +4804,14 @@ v_cmp_neq_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4232,6 +4864,14 @@ v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4287,6 +4927,14 @@ v_cmp_nge_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4339,6 +4987,14 @@ v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4394,6 +5050,14 @@ v_cmp_nge_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4446,6 +5110,14 @@ v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4501,6 +5173,14 @@ v_cmp_ngt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4553,6 +5233,14 @@ v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4608,6 +5296,14 @@ v_cmp_ngt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4660,6 +5356,14 @@ v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4715,6 +5419,14 @@ v_cmp_nle_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4767,6 +5479,14 @@ v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4822,6 +5542,14 @@ v_cmp_nle_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4874,6 +5602,14 @@ v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4929,6 +5665,14 @@ v_cmp_nlg_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -4981,6 +5725,14 @@ v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5036,6 +5788,14 @@ v_cmp_nlg_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5088,6 +5848,14 @@ v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5143,6 +5911,14 @@ v_cmp_nlt_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5195,6 +5971,14 @@ v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5250,6 +6034,14 @@ v_cmp_nlt_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5302,6 +6094,14 @@ v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5357,6 +6157,14 @@ v_cmp_o_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5409,6 +6217,14 @@ v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5464,6 +6280,14 @@ v_cmp_o_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5516,6 +6340,14 @@ v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5571,6 +6403,14 @@ v_cmp_u_f16_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5623,6 +6463,14 @@ v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5678,6 +6526,14 @@ v_cmp_u_f32_e64_dpp s5, v1, v2 quad_perm:[3,2,1,0]
// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s5, v1, s2 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s5, v1, 2.0 quad_perm:[3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s5, v1, v2 quad_perm:[0,1,2,3]
// W32: [0x05,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -5730,6 +6586,14 @@ v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0]
// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s[10:11], v1, s2 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s[10:11], v1, 2.0 quad_perm:[3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s[10:11], v1, v2 quad_perm:[0,1,2,3]
// W64: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
index b9dc614..c5ba45e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3c_dpp8.s
@@ -7,6 +7,14 @@ v_cmp_class_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -27,6 +35,14 @@ v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -46,6 +62,14 @@ v_cmp_class_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x7e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -66,6 +90,14 @@ v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_class_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_class_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x7e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -85,6 +117,14 @@ v_cmp_eq_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x02,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -105,6 +145,14 @@ v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x02,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -124,6 +172,14 @@ v_cmp_eq_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x12,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -144,6 +200,14 @@ v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x12,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -163,6 +227,14 @@ v_cmp_eq_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x32,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -183,6 +255,14 @@ v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x32,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -202,6 +282,14 @@ v_cmp_eq_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x42,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -222,6 +310,14 @@ v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x42,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -241,6 +337,14 @@ v_cmp_eq_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -261,6 +365,14 @@ v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -280,6 +392,14 @@ v_cmp_eq_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -300,6 +420,14 @@ v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_eq_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_eq_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4a,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -319,6 +447,14 @@ v_cmp_ge_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x06,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -339,6 +475,14 @@ v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x06,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -358,6 +502,14 @@ v_cmp_ge_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x16,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -378,6 +530,14 @@ v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x16,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -397,6 +557,14 @@ v_cmp_ge_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x36,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -417,6 +585,14 @@ v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x36,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -436,6 +612,14 @@ v_cmp_ge_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x46,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -456,6 +640,14 @@ v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x46,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -475,6 +667,14 @@ v_cmp_ge_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -495,6 +695,14 @@ v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -514,6 +722,14 @@ v_cmp_ge_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -534,6 +750,14 @@ v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ge_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -553,6 +777,14 @@ v_cmp_gt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x04,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -573,6 +805,14 @@ v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x04,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -592,6 +832,14 @@ v_cmp_gt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x14,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -612,6 +860,14 @@ v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x14,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -631,6 +887,14 @@ v_cmp_gt_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x34,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -651,6 +915,14 @@ v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x34,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -670,6 +942,14 @@ v_cmp_gt_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x44,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -690,6 +970,14 @@ v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x44,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -709,6 +997,14 @@ v_cmp_gt_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -729,6 +1025,14 @@ v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -748,6 +1052,14 @@ v_cmp_gt_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -768,6 +1080,14 @@ v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_gt_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_gt_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4c,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -787,6 +1107,14 @@ v_cmp_le_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x03,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -807,6 +1135,14 @@ v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x03,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -826,6 +1162,14 @@ v_cmp_le_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x13,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -846,6 +1190,14 @@ v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x13,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -865,6 +1217,14 @@ v_cmp_le_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x33,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -885,6 +1245,14 @@ v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x33,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -904,6 +1272,14 @@ v_cmp_le_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x43,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -924,6 +1300,14 @@ v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x43,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -943,6 +1327,14 @@ v_cmp_le_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -963,6 +1355,14 @@ v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -982,6 +1382,14 @@ v_cmp_le_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1002,6 +1410,14 @@ v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_le_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_le_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4b,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1021,6 +1437,14 @@ v_cmp_lg_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x05,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1041,6 +1465,14 @@ v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x05,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1060,6 +1492,14 @@ v_cmp_lg_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x15,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1080,6 +1520,14 @@ v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lg_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lg_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x15,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1099,6 +1547,14 @@ v_cmp_lt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x01,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1119,6 +1575,14 @@ v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x01,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1138,6 +1602,14 @@ v_cmp_lt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x11,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1158,6 +1630,14 @@ v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x11,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1177,6 +1657,14 @@ v_cmp_lt_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x31,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1197,6 +1685,14 @@ v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x31,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1216,6 +1712,14 @@ v_cmp_lt_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x41,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1236,6 +1740,14 @@ v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x41,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1255,6 +1767,14 @@ v_cmp_lt_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x39,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1275,6 +1795,14 @@ v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x39,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1294,6 +1822,14 @@ v_cmp_lt_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x49,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1314,6 +1850,14 @@ v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_lt_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_lt_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x49,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1333,6 +1877,14 @@ v_cmp_ne_i16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x35,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1353,6 +1905,14 @@ v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x35,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1372,6 +1932,14 @@ v_cmp_ne_i32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x45,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1392,6 +1960,14 @@ v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_i32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_i32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x45,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1411,6 +1987,14 @@ v_cmp_ne_u16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x3d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1431,6 +2015,14 @@ v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u16_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x3d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1450,6 +2042,14 @@ v_cmp_ne_u32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s5, v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x4d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1470,6 +2070,14 @@ v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ne_u32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ne_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x4d,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1489,6 +2097,14 @@ v_cmp_neq_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1509,6 +2125,14 @@ v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1528,6 +2152,14 @@ v_cmp_neq_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1548,6 +2180,14 @@ v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_neq_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_neq_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1567,6 +2207,14 @@ v_cmp_nge_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x09,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1587,6 +2235,14 @@ v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x09,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1606,6 +2262,14 @@ v_cmp_nge_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x19,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1626,6 +2290,14 @@ v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nge_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nge_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x19,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1645,6 +2317,14 @@ v_cmp_ngt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1665,6 +2345,14 @@ v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1684,6 +2372,14 @@ v_cmp_ngt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1704,6 +2400,14 @@ v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_ngt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1b,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1723,6 +2427,14 @@ v_cmp_nle_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1743,6 +2455,14 @@ v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1762,6 +2482,14 @@ v_cmp_nle_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1782,6 +2510,14 @@ v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nle_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nle_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1c,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1801,6 +2537,14 @@ v_cmp_nlg_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1821,6 +2565,14 @@ v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1840,6 +2592,14 @@ v_cmp_nlg_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1860,6 +2620,14 @@ v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlg_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1879,6 +2647,14 @@ v_cmp_nlt_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x0e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1899,6 +2675,14 @@ v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x0e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1918,6 +2702,14 @@ v_cmp_nlt_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x1e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1938,6 +2730,14 @@ v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_nlt_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x1e,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1957,6 +2757,14 @@ v_cmp_o_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x07,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1977,6 +2785,14 @@ v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x07,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -1996,6 +2812,14 @@ v_cmp_o_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x17,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2016,6 +2840,14 @@ v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_o_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_o_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x17,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2035,6 +2867,14 @@ v_cmp_u_f16_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x08,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2055,6 +2895,14 @@ v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f16_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x08,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2074,6 +2922,14 @@ v_cmp_u_f32_e64_dpp s5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x05,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s5, v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s5, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W32: [0x05,0x00,0x18,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s105, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: [0x69,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
@@ -2094,6 +2950,14 @@ v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cmp_u_f32_e64_dpp s[10:11], v1, s2 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_cmp_u_f32_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0]
+// W64: [0x0a,0x00,0x18,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W64: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
index 03958ba..eae2d5b2 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s
@@ -4,6 +4,12 @@
v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_class_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_class_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_class_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -46,6 +52,12 @@ v_cmpx_class_f16_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_class_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_class_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_class_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -88,6 +100,12 @@ v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 b
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -130,6 +148,12 @@ v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -172,6 +196,12 @@ v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -214,6 +244,12 @@ v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -256,6 +292,12 @@ v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -298,6 +340,12 @@ v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_eq_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -340,6 +388,12 @@ v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -382,6 +436,12 @@ v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -424,6 +484,12 @@ v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -466,6 +532,12 @@ v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -508,6 +580,12 @@ v_cmpx_ge_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -550,6 +628,12 @@ v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ge_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -592,6 +676,12 @@ v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -634,6 +724,12 @@ v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -676,6 +772,12 @@ v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -718,6 +820,12 @@ v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -760,6 +868,12 @@ v_cmpx_gt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -802,6 +916,12 @@ v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_gt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -844,6 +964,12 @@ v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -886,6 +1012,12 @@ v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -928,6 +1060,12 @@ v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -970,6 +1108,12 @@ v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1012,6 +1156,12 @@ v_cmpx_le_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1054,6 +1204,12 @@ v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_le_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1096,6 +1252,12 @@ v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1138,6 +1300,12 @@ v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1180,6 +1348,12 @@ v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1222,6 +1396,12 @@ v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1264,6 +1444,12 @@ v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask
v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1306,6 +1492,12 @@ v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1348,6 +1540,12 @@ v_cmpx_lt_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1390,6 +1588,12 @@ v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lt_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lt_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_lt_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1432,6 +1636,12 @@ v_cmpx_lt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_i16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_i16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_i16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1474,6 +1684,12 @@ v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_i32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_i32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_i32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1516,6 +1732,12 @@ v_cmpx_ne_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_u16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_u16_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_u16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1558,6 +1780,12 @@ v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ne_u32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ne_u32_e64_dpp v1, 10 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x14,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ne_u32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1600,6 +1828,12 @@ v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_c
v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1642,6 +1876,12 @@ v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1684,6 +1924,12 @@ v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1726,6 +1972,12 @@ v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1768,6 +2020,12 @@ v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1810,6 +2068,12 @@ v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1852,6 +2116,12 @@ v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1894,6 +2164,12 @@ v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1936,6 +2212,12 @@ v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -1978,6 +2260,12 @@ v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2020,6 +2308,12 @@ v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2062,6 +2356,12 @@ v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2104,6 +2404,12 @@ v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mas
v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2146,6 +2452,12 @@ v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2188,6 +2500,12 @@ v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
@@ -2230,6 +2548,12 @@ v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:
v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f32_e64_dpp v1, s2 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f32_e64_dpp v1, 2.0 quad_perm:[3,2,1,0]
+// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff]
+
v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[0,1,2,3]
// GFX12: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
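All of the src1 variants added above differ only in the 9-bit operand field at bits 9-17 of the second instruction dword. The following is a minimal decoding sketch, not part of the test suite, assuming the standard GFX12 VOP3 operand numbering (SGPRn = n, inline integer k = 128 + k, 2.0 = 244, VGPRn = 256 + n):

# Minimal sketch (illustrative only): pull the src1 operand out of the
# 12-byte VOP3 DPP encodings shown in the GFX12 check lines above.
def decode_src1(enc):
    # Bytes 4-7 hold the second dword: src0 | src1 << 9 | src2 << 18 | ...
    dword = int.from_bytes(bytes(enc[4:8]), "little")
    code = (dword >> 9) & 0x1FF                # 9-bit src1 operand code
    if code >= 256:
        return "v%d" % (code - 256)            # VGPR
    if code <= 105:
        return "s%d" % code                    # SGPR
    if 129 <= code <= 192:
        return str(code - 128)                 # inline integer constant 1..64
    return {244: "2.0"}.get(code, hex(code))   # inline float constants

# The three src1 flavours exercised by the added v_cmpx_ge_i32 cases:
print(decode_src1([0x7E, 0x00, 0xC6, 0xD4, 0xFA, 0x04, 0x02, 0x00,
                   0x01, 0x1B, 0x00, 0xFF]))   # -> v2
print(decode_src1([0x7E, 0x00, 0xC6, 0xD4, 0xFA, 0x04, 0x00, 0x00,
                   0x01, 0x1B, 0x00, 0xFF]))   # -> s2
print(decode_src1([0x7E, 0x00, 0xC6, 0xD4, 0xFA, 0x14, 0x01, 0x00,
                   0x01, 0x1B, 0x00, 0xFF]))   # -> 10

Running it on those three encodings prints v2, s2 and 10, matching the assembly on the corresponding check lines.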
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
index efc6168..d63ca0c 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s
@@ -7,6 +7,12 @@ v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_class_f16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfd,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x01,0xfd,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
@@ -16,6 +22,12 @@ v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_class_f32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_class_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xfe,0xd4,0xea,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
@@ -28,6 +40,12 @@ v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -40,6 +58,12 @@ v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x92,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x92,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -49,6 +73,12 @@ v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -58,6 +88,12 @@ v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc2,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc2,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -67,6 +103,12 @@ v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xba,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xba,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -76,6 +118,12 @@ v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xca,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -88,6 +136,12 @@ v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -100,6 +154,12 @@ v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x96,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x96,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -109,6 +169,12 @@ v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -118,6 +184,12 @@ v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc6,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc6,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -127,6 +199,12 @@ v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbe,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbe,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -136,6 +214,12 @@ v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xce,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -148,6 +232,12 @@ v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -160,6 +250,12 @@ v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x94,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x94,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -169,6 +265,12 @@ v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -178,6 +280,12 @@ v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc4,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc4,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -187,6 +295,12 @@ v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -196,6 +310,12 @@ v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcc,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -208,6 +328,12 @@ v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -220,6 +346,12 @@ v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x93,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_le_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x93,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -229,6 +361,12 @@ v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -238,6 +376,12 @@ v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc3,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc3,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -247,6 +391,12 @@ v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -256,6 +406,12 @@ v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcb,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -268,6 +424,12 @@ v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -280,6 +442,12 @@ v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x95,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x95,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -292,6 +460,12 @@ v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x81,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x81,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -304,6 +478,12 @@ v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x91,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x91,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -313,6 +493,12 @@ v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -322,6 +508,12 @@ v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc1,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc1,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -331,6 +523,12 @@ v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -340,6 +538,12 @@ v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lt_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc9,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc9,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -349,6 +553,12 @@ v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_i16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_i16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xb5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xb5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -358,6 +568,12 @@ v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_i32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xc5,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xc5,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -367,6 +583,12 @@ v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_u16_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_u16_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xbd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xbd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -376,6 +598,12 @@ v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ne_u32_e64_dpp v1, s2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ne_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x00,0xcd,0xd4,0xea,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+
v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
@@ -388,6 +616,12 @@ v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -400,6 +634,12 @@ v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -412,6 +652,12 @@ v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -424,6 +670,12 @@ v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x99,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -436,6 +688,12 @@ v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -448,6 +706,12 @@ v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -460,6 +724,12 @@ v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -472,6 +742,12 @@ v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -484,6 +760,12 @@ v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -496,6 +778,12 @@ v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -508,6 +796,12 @@ v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -520,6 +814,12 @@ v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x9e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -532,6 +832,12 @@ v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -544,6 +850,12 @@ v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_o_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x97,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -556,6 +868,12 @@ v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
@@ -568,5 +886,11 @@ v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_u_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_u_f32_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: [0x7e,0x02,0x98,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+
v_cmpx_u_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX12: [0x7e,0x83,0x98,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
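In the dpp8 forms above, the trailing three bytes pack the eight 3-bit lane selectors, and the src0 operand code distinguishes fi:0 (0xe9) from fi:1 (0xea). A small illustrative sketch, under the same assumptions as the src1 decoder earlier:

# Illustrative sketch: recover the dpp8 lane list from the last three
# bytes of the 12-byte encodings checked above.
def decode_dpp8(enc):
    sel = int.from_bytes(bytes(enc[9:12]), "little")   # bytes 9-11
    lanes = [(sel >> (3 * i)) & 0x7 for i in range(8)]
    fi = enc[4] == 0xEA        # src0 code: 0xE9 = fi:0, 0xEA = fi:1
    return lanes, fi

enc = [0x7E, 0x00, 0xC6, 0xD4, 0xEA, 0x04, 0x00, 0x00,
       0x01, 0x77, 0x39, 0x05]    # v_cmpx_ge_i32_e64_dpp v1, s2 ... fi:1
print(decode_dpp8(enc))           # -> ([7, 6, 5, 4, 3, 2, 1, 0], True)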
diff --git a/llvm/test/MC/AMDGPU/vop_dpp.s b/llvm/test/MC/AMDGPU/vop_dpp.s
index b2251f5..a15a48e 100644
--- a/llvm/test/MC/AMDGPU/vop_dpp.s
+++ b/llvm/test/MC/AMDGPU/vop_dpp.s
@@ -648,8 +648,8 @@ v_mov_b32 v0, s1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
v_and_b32 v0, s42, v1 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
// NOSICI: :[[@LINE+3]]:{{[0-9]+}}: error: not a valid operand.
-// NOVI: :[[@LINE+2]]:{{[0-9]+}}: error: invalid operand for instruction
-// NOGFX9: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+// NOVI: :[[@LINE+2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// NOGFX9: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
v_add_f32 v0, v1, s45 row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
index 6ab3e08..52426d3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
@@ -17,3 +17,12 @@
# GFX1150: v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05
+
+# GFX1150: v_add_f32_e64_dpp v5, v1, s2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff]
+0x05,0x00,0x03,0xd5,0xfa,0x04,0x00,0x00,0x01,0x40,0x01,0xff
+
+# GFX1150: v_min3_f16_e64_dpp v5, v1, s2, 2.0 op_sel:[1,1,0,1] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff]
+0x05,0x58,0x49,0xd6,0xfa,0x04,0xd0,0x03,0x01,0x55,0x00,0xff
+
+# GFX1150: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
index 1be97b2..1d69134 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
@@ -22,3 +22,7 @@
# This check is stricter than the one in vinterp-fake16.txt and is GFX12-specific.
# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
0x00,0x00,0xe0,0xcd,0x01,0x05,0x0e,0x1c
+
+# Regression test for future fixes to VOPC _e64_dpp src1 handling: the plain VOPC DPP encoding below must keep disassembling as-is.
+# GFX12: v_cmp_le_f32 vcc_lo, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff]
+0xfa,0x04,0x26,0x7c,0x01,0x40,0x01,0xff
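
The regression line above pins the plain e32 VOPC DPP form, in which vsrc1 is an 8-bit VGPR-only field; SGPR and inline-constant src1 are therefore only reachable through the _e64_dpp encodings exercised elsewhere in this patch. A sketch of the e32 field split, assuming the standard VOPC layout (src0 in bits [8:0], vsrc1 in [16:9], opcode in [24:17], with src0 = 0xfa selecting DPP16):

    # Unpack a VOPC e32 dword; src0 = 0xfa is the DPP16 escape value.
    def vopc_e32(b0, b1, b2, b3):
        dw = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
        return {"src0": dw & 0x1FF, "vsrc1": (dw >> 9) & 0xFF, "op": (dw >> 17) & 0xFF}

    # v_cmp_le_f32 vcc_lo, v1, v2 row_mirror ...: bytes 0xfa,0x04,0x26,0x7c
    assert vopc_e32(0xFA, 0x04, 0x26, 0x7C) == {"src0": 0xFA, "vsrc1": 2, "op": 0x13}
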
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 4303c6d..0771e64 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -4,6 +4,12 @@
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_add3_u32_e64_dpp v5, v1, 15, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x1e,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x1e,0x0d,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x55,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -101,6 +107,9 @@
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_add_lshl_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x47,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x47,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -185,6 +194,9 @@
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_alignbit_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x16,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -227,6 +239,9 @@
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_alignbyte_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x17,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x17,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -311,6 +326,9 @@
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_and_or_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x57,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x57,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -437,6 +455,9 @@
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfe_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x11,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x11,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -479,6 +500,9 @@
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfe_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x10,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x10,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -521,6 +545,9 @@
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_bfi_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x12,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x12,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -660,6 +687,9 @@
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubeid_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -702,6 +732,9 @@
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubema_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -744,6 +777,9 @@
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubesc_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -786,6 +822,9 @@
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cubetc_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1104,6 +1143,9 @@
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x26,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x26,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1230,6 +1272,9 @@
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_fma_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x13,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x13,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1314,6 +1359,9 @@
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lerp_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x15,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x15,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1356,6 +1404,9 @@
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lshl_add_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x46,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x46,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1398,6 +1449,9 @@
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_lshl_or_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x56,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x56,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1524,6 +1578,9 @@
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i32_i24_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1566,6 +1623,9 @@
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u32_u24_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1608,6 +1668,9 @@
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1650,6 +1713,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1668,6 +1734,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x11,0x01,0xff
+# GFX12: v_max3_i32_e64_dpp v5, v1, 15, vcc_lo row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x1e,0xa9,0x01,0x01,0x11,0x01,0xff]
+0x05,0x00,0x1d,0xd6,0xfa,0x1e,0xa9,0x01,0x01,0x11,0x01,0xff
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, ttmp15 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff]
0x05,0x00,0x1d,0xd6,0xfa,0x04,0xee,0x01,0x01,0x1f,0x01,0xff
@@ -1692,6 +1761,9 @@
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1818,6 +1890,12 @@
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, -2.0, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6b,0xd6,0xfa,0xea,0x0d,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x6b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1860,6 +1938,9 @@
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x69,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x69,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1902,6 +1983,9 @@
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x64,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x64,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -1944,6 +2028,9 @@
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maxmin_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x62,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x62,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2070,6 +2157,9 @@
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x31,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x31,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2112,6 +2202,9 @@
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x20,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x20,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2154,6 +2247,9 @@
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x21,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x21,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2196,6 +2292,9 @@
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x29,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x29,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2238,6 +2337,9 @@
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2280,6 +2382,9 @@
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2406,6 +2511,9 @@
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x6a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2448,6 +2556,9 @@
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_num_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x68,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x68,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2490,6 +2601,9 @@
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_i32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x65,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x65,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2532,6 +2646,9 @@
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minmax_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x63,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x63,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2574,6 +2691,9 @@
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_msad_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x39,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x39,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2658,6 +2778,9 @@
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mullit_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x18,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x18,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2700,6 +2823,9 @@
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_or3_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x58,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x58,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2784,6 +2910,9 @@
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_perm_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x44,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x44,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2826,6 +2955,9 @@
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_hi_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x23,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x23,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2868,6 +3000,9 @@
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x24,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x24,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2910,6 +3045,9 @@
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x25,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x25,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -2952,6 +3090,9 @@
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_sad_u8_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x22,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x22,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3146,6 +3287,9 @@
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_xad_u32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x45,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x45,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3188,6 +3332,9 @@
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_xor3_b32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x40,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x40,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3440,6 +3587,9 @@
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x54,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_div_fixup_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x54,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x54,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x54,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3482,6 +3632,9 @@
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x48,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x48,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x48,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3524,6 +3677,9 @@
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x53,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x53,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3566,6 +3722,9 @@
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x5a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_i32_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x5a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x5a,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3608,6 +3767,9 @@
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x41,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x41,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3650,6 +3812,9 @@
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x59,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_mad_u32_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x59,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x59,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x59,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3692,6 +3857,9 @@
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2c,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3734,6 +3902,9 @@
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3776,6 +3947,9 @@
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_max3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3818,6 +3992,9 @@
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x32,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x32,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x32,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3860,6 +4037,9 @@
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x50,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x50,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3902,6 +4082,9 @@
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_med3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x51,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x51,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -3944,6 +4127,9 @@
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2b,0xd6,0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0xff
@@ -3986,6 +4172,9 @@
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_i16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4a,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4a,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -4028,6 +4217,9 @@
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_min3_u16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x4b,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff]
0x05,0x00,0x4b,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x00,0xff
@@ -4417,6 +4609,9 @@
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximum3_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4459,6 +4654,9 @@
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimum3_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4501,6 +4699,9 @@
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximum3_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x30,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x30,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4543,6 +4744,9 @@
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimum3_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x2f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4585,6 +4789,9 @@
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6d,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6d,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4627,6 +4834,9 @@
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6c,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4669,6 +4879,9 @@
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6f,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6f,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
@@ -4711,6 +4924,9 @@
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
+# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, s3, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x6e,0xd6,0xfa,0x06,0x0c,0x04,0x01,0x1b,0x00,0xff
+
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff]
0x05,0x00,0x6e,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x40,0x01,0xff
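
Across the dpp16 hunks above, the only delta between each v2 check and its new s3 (or inline-constant) sibling is the src1 field of the second VOP3 dword, which is what moves the middle bytes from 0x04,0x0e to 0x06,0x0c. A minimal decode, assuming the usual VOP3 packing (three 9-bit source fields at bits [8:0], [17:9] and [26:18]; VGPR N encodes as 256+N, SGPR N as plain N, and inline constant K as 128+K for small integers):

    # Split the second VOP3 dword into its three 9-bit source fields.
    def vop3_srcs(b4, b5, b6, b7):
        dw = b4 | (b5 << 8) | (b6 << 16) | (b7 << 24)
        return [(dw >> s) & 0x1FF for s in (0, 9, 18)]

    # src1 = v2: bytes 0xfa,0x04,0x0e,0x04 -> [0xfa (DPP16), 0x102 (v2), 0x103 (v3)]
    assert vop3_srcs(0xFA, 0x04, 0x0E, 0x04) == [0x0FA, 0x102, 0x103]
    # src1 = s3: bytes 0xfa,0x06,0x0c,0x04 -> src1 drops to plain 0x003
    assert vop3_srcs(0xFA, 0x06, 0x0C, 0x04) == [0x0FA, 0x003, 0x103]
    # src1 = 15: bytes 0xfa,0x1e,0x0d,0x04 -> src1 becomes 0x08f (128 + 15)
    assert vop3_srcs(0xFA, 0x1E, 0x0D, 0x04) == [0x0FA, 0x08F, 0x103]
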
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index c73ffe7..a836ada 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -4,6 +4,12 @@
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_add3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
+# GFX12: v_add3_u32_e64_dpp v5, v1, 10, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x55,0xd6,0xe9,0x14,0x0d,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_add3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -59,6 +65,9 @@
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x47,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_add_lshl_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x47,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_add_lshl_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x47,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -101,6 +110,9 @@
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_alignbit_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x16,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_alignbit_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -134,6 +146,9 @@
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x17,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_alignbyte_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x17,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_alignbyte_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x17,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -173,6 +188,9 @@
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x57,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_and_or_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x57,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_and_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x57,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -221,6 +239,9 @@
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfe_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x11,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfe_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -257,6 +278,9 @@
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x10,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfe_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x10,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfe_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x10,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -293,6 +317,9 @@
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_bfi_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x12,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_bfi_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -354,6 +381,9 @@
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubeid_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -390,6 +420,9 @@
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubema_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubema_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0f,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -426,6 +459,9 @@
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubesc_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubesc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -462,6 +498,9 @@
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cubetc_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cubetc_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -582,6 +621,9 @@
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x26,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_u8_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -642,6 +684,9 @@
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_fma_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x13,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_fma_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -690,6 +735,9 @@
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lerp_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x15,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lerp_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -726,6 +774,9 @@
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x46,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lshl_add_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x46,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lshl_add_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x46,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -762,6 +813,9 @@
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x56,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_lshl_or_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x56,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_lshl_or_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x56,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -810,6 +864,9 @@
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i32_i24_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i32_i24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -846,6 +903,9 @@
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u32_u24_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u32_u24_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -882,6 +942,9 @@
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -918,6 +981,9 @@
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -954,6 +1020,9 @@
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1002,6 +1071,9 @@
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x6b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1038,6 +1110,9 @@
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x69,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x69,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x69,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1074,6 +1149,9 @@
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x64,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x64,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x64,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1110,6 +1188,9 @@
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x62,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maxmin_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x62,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maxmin_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x62,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1158,6 +1239,9 @@
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x31,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1194,6 +1278,9 @@
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x20,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x20,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x20,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1230,6 +1317,9 @@
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x21,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x21,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x21,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1266,6 +1356,9 @@
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x29,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x29,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x29,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1302,6 +1395,9 @@
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1338,6 +1434,9 @@
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1386,6 +1485,9 @@
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x6a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1422,6 +1524,9 @@
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x68,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_num_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x68,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_num_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x68,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1458,6 +1563,9 @@
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x65,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_i32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x65,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_i32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x65,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1494,6 +1602,9 @@
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x63,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minmax_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x63,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minmax_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x63,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1530,6 +1641,9 @@
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x39,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_msad_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x39,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_msad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x39,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1572,6 +1686,9 @@
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mullit_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x18,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mullit_f32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1608,6 +1725,9 @@
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x58,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_or3_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x58,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_or3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x58,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1650,6 +1770,9 @@
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x44,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_perm_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x44,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_perm_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x44,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1686,6 +1809,9 @@
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x23,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_hi_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x23,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_hi_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x23,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1722,6 +1848,9 @@
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x24,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x24,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x24,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1758,6 +1887,9 @@
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x25,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1794,6 +1926,9 @@
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x22,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_sad_u8_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x22,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_sad_u8_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x22,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1874,6 +2009,9 @@
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x45,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_xad_u32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x45,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_xad_u32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x45,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -1910,6 +2048,9 @@
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x40,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_xor3_b32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x40,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_xor3_b32_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x40,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2006,6 +2147,9 @@
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x54,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_div_fixup_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x54,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_div_fixup_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x54,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2048,6 +2192,12 @@
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x48,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_fma_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x48,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
+# GFX12: v_fma_f16_e64_dpp v5, v1, 4.0, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x48,0xd6,0xe9,0xec,0x0d,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_fma_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x48,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2090,6 +2240,9 @@
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x53,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x53,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x53,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2129,6 +2282,9 @@
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x5a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_i32_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x5a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_i32_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x5a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2171,6 +2327,9 @@
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x41,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x41,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x41,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2210,6 +2369,9 @@
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x59,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_mad_u32_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x59,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_mad_u32_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x59,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2252,6 +2414,9 @@
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2c,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2294,6 +2459,9 @@
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4d,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2333,6 +2501,9 @@
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_max3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_max3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4e,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2372,6 +2543,9 @@
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x32,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2414,6 +2588,9 @@
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x50,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x50,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x50,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2453,6 +2630,9 @@
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x51,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_med3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x51,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_med3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x51,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2492,6 +2672,9 @@
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_num_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_num_f16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x2b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2534,6 +2717,9 @@
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4a,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_i16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4a,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_i16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4a,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2573,6 +2759,9 @@
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x4b,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_min3_u16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x4b,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_min3_u16_e64_dpp v5, v1, v2, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05]
0x05,0x00,0x4b,0xd6,0xe9,0x04,0xfe,0x07,0x01,0x77,0x39,0x05
@@ -2752,6 +2941,9 @@
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximum3_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2788,6 +2980,9 @@
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimum3_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimum3_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2824,6 +3019,9 @@
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximum3_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x30,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2860,6 +3058,9 @@
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimum3_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimum3_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2896,6 +3097,9 @@
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6d,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6d,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximumminimum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2932,6 +3136,9 @@
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6c,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimummaximum_f32_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6c,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -2968,6 +3175,9 @@
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6f,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6f,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_maximumminimum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6f,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
@@ -3004,6 +3214,9 @@
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x6e,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
+# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, s3, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05]
+0x05,0x00,0x6e,0xd6,0xe9,0x06,0x0c,0x04,0x01,0x77,0x39,0x05
+
# GFX12: v_minimummaximum_f16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x6e,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
index 56d7805b..b10b8da 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp16.txt
@@ -59,6 +59,9 @@
# GFX12: v_add_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x32,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x32,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -101,6 +104,9 @@
# GFX12: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x03,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x03,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -143,6 +149,9 @@
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_add_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x25,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x25,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -185,6 +194,9 @@
# GFX12: v_and_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_and_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_and_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -227,6 +239,9 @@
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_ashrrev_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -270,6 +285,10 @@
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff
+# W32: v_cndmask_b32_e64_dpp v5, v1, s3, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff]
+# W64: v_cndmask_b32_e64_dpp v5, v1, s3, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x01,0xd5,0xfa,0x06,0x18,0x00,0x01,0x1b,0x00,0xff
+
# W32: v_cndmask_b32_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x01,0xd5,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff
@@ -324,6 +343,9 @@
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x2f,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x2f,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -366,6 +388,9 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x3b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -390,6 +415,9 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, 2.0 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x21,0x01,0xff]
+0x05,0x00,0x3b,0xd5,0xfa,0xe8,0x01,0x00,0x01,0x21,0x01,0xff
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x05,0x00,0x3b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
@@ -408,6 +436,9 @@
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_lshlrev_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x18,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x18,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -450,6 +481,9 @@
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_lshrrev_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x19,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x19,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -492,6 +526,9 @@
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_num_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x31,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x31,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -534,6 +571,9 @@
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_num_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x16,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x16,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -576,6 +616,9 @@
# GFX12: v_max_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x12,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x12,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -618,6 +661,9 @@
# GFX12: v_max_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_max_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x14,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_max_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x14,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -660,6 +706,9 @@
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_num_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x30,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x30,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -702,6 +751,9 @@
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_num_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x15,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x15,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -744,6 +796,9 @@
# GFX12: v_min_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_i32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x11,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_i32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x11,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -786,6 +841,9 @@
# GFX12: v_min_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_min_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x13,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_min_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x13,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -828,6 +886,9 @@
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x07,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x07,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -870,6 +931,9 @@
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x35,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x35,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -912,6 +976,9 @@
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x08,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x08,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -954,6 +1021,9 @@
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0a,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0a,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -996,6 +1066,9 @@
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1038,6 +1111,9 @@
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_i32_i24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x09,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x09,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1080,6 +1156,9 @@
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_mul_u32_u24_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x0b,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x0b,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1122,6 +1201,9 @@
# GFX12: v_or_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_or_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1c,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_or_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1c,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1219,6 +1301,9 @@
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x33,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x33,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1261,6 +1346,9 @@
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x04,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x04,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1303,6 +1391,9 @@
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_sub_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x26,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x26,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1400,6 +1491,9 @@
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_f16_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x34,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x34,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1442,6 +1536,9 @@
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_f32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x05,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x05,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1484,6 +1581,9 @@
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x27,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x27,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1526,6 +1626,9 @@
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_xnor_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1e,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1e,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
@@ -1568,6 +1671,9 @@
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
+# GFX12: v_xor_b32_e64_dpp v5, v1, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff]
+0x05,0x00,0x1d,0xd5,0xfa,0x06,0x00,0x00,0x01,0x1b,0x00,0xff
+
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
0x05,0x00,0x1d,0xd5,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
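Every added pair in the dpp16 file above follows one pattern: src1 in the disassembly moves from a VGPR (v2) to an SGPR (s3), and the only bytes that change are the 9-bit src1 field packed into bytes 4-7 of the VOP3 encoding (0x04,0x02 becomes 0x06,0x00). A minimal decoding sketch, assuming the usual GFX12 operand codes (0-105 for SGPRs, 256+n for VGPR n) and a hypothetical helper name — this is not LLVM's decoder, and it only handles those two register classes:

def src1_of(enc):
    # bytes 4..7 are the second encoding dword (little-endian);
    # src1 occupies bits [17:9] of that dword
    dword1 = int.from_bytes(bytes(enc[4:8]), "little")
    code = (dword1 >> 9) & 0x1FF
    return f"v{code - 256}" if code >= 256 else f"s{code}"

# the v2 form of v_xor_b32 above: src1 bytes 0x04,0x02 -> code 0x102 -> "v2"
print(src1_of([0x05, 0x00, 0x1d, 0xd5, 0xfa, 0x04, 0x02, 0x00, 0x01, 0x1b, 0x00, 0xff]))
# the new s3 form: src1 bytes 0x06,0x00 -> code 0x003 -> "s3"
print(src1_of([0x05, 0x00, 0x1d, 0xd5, 0xfa, 0x06, 0x00, 0x00, 0x01, 0x1b, 0x00, 0xff]))

Run on the first and second encoding of any added pair in this file, the sketch prints v2 and s3 respectively, matching the disassembly in the check lines.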
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
index da7faa8..f78106e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop2_dpp8.txt
@@ -23,6 +23,9 @@
# GFX12: v_add_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x32,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x32,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x32,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x32,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -35,6 +38,9 @@
# GFX12: v_add_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x03,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x03,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x03,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -47,18 +53,27 @@
# GFX12: v_add_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x25,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_add_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x25,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x25,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_add_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x25,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x25,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_and_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_and_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_and_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_ashrrev_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_ashrrev_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_ashrrev_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -66,6 +81,10 @@
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x01,0xd5,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05
+# W32: v_cndmask_b32_e64_dpp v5, v1, s3, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cndmask_b32_e64_dpp v5, v1, s3, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x01,0xd5,0xe9,0x06,0x18,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cndmask_b32_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
# W64: v_cndmask_b32_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
0x05,0x00,0x01,0xd5,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05
@@ -84,6 +103,9 @@
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x2f,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x2f,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x2f,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cvt_pk_rtz_f16_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x05,0x01,0x2f,0xd5,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -96,30 +118,48 @@
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x08,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x08,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x08,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v5, v1, v2 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05]
0x05,0x00,0x3b,0xd5,0xe9,0x04,0x02,0x10,0x01,0x77,0x39,0x05
+# GFX12: v_ldexp_f16_e64_dpp v5, v1, s3 mul:4 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x10,0x01,0x77,0x39,0x05]
+0x05,0x00,0x3b,0xd5,0xe9,0x06,0x00,0x10,0x01,0x77,0x39,0x05
+
# GFX12: v_ldexp_f16_e64_dpp v255, -|v255|, v255 clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x81,0x3b,0xd5,0xea,0xfe,0x03,0x38,0xff,0x00,0x00,0x00]
0xff,0x81,0x3b,0xd5,0xea,0xfe,0x03,0x38,0xff,0x00,0x00,0x00
# GFX12: v_lshlrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x18,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_lshlrev_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x18,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x18,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_lshlrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x18,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x18,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_lshrrev_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x19,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_lshrrev_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x19,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x19,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_lshrrev_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x19,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x19,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_max_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x31,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_num_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x31,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x31,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x31,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -132,6 +172,9 @@
# GFX12: v_max_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x16,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_num_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x16,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x16,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x16,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -144,18 +187,27 @@
# GFX12: v_max_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x12,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x12,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x12,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x12,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_max_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x14,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_max_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x14,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x14,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_max_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x14,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x14,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_min_num_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x30,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_num_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x30,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x30,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_num_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x30,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -168,6 +220,9 @@
# GFX12: v_min_num_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x15,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_num_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x15,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x15,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_num_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x15,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -180,18 +235,27 @@
# GFX12: v_min_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x11,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_i32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x11,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x11,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_i32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x11,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x11,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_min_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x13,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_min_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x13,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x13,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_min_u32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x13,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x13,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x07,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x07,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x07,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_dx9_zero_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x07,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -204,6 +268,9 @@
# GFX12: v_mul_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x35,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x35,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x35,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x35,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -216,6 +283,9 @@
# GFX12: v_mul_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x08,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x08,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x08,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x08,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -228,30 +298,45 @@
# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0a,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_hi_i32_i24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0a,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_hi_i32_i24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x0a,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_hi_u32_u24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_hi_u32_u24_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x0c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x0c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_i32_i24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x09,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_i32_i24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x09,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x09,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_i32_i24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x09,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x09,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_mul_u32_u24_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x0b,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_mul_u32_u24_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x0b,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_mul_u32_u24_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x0b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x0b,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_or_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1c,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_or_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1c,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_or_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1c,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -277,6 +362,9 @@
# GFX12: v_sub_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x33,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x33,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x33,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x33,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -289,6 +377,9 @@
# GFX12: v_sub_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x04,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x04,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x04,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -301,6 +392,9 @@
# GFX12: v_sub_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x26,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_sub_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x26,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x26,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_sub_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x26,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x26,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
@@ -326,6 +420,9 @@
# GFX12: v_subrev_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x34,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_f16_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x34,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x34,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_f16_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x34,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -338,6 +435,9 @@
# GFX12: v_subrev_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x05,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_f32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x05,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x05,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_f32_e64_dpp v5, |v1|, -v2 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05]
0x05,0x01,0x05,0xd5,0xe9,0x04,0x02,0x48,0x01,0x77,0x39,0x05
@@ -350,17 +450,26 @@
# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x27,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_subrev_nc_u32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x27,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x27,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_subrev_nc_u32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x80,0x27,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x80,0x27,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_xnor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1e,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_xnor_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1e,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1e,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_xnor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1e,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1e,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_xor_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x1d,0xd5,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_xor_b32_e64_dpp v5, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x1d,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x05,0x00,0x1d,0xd5,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_xor_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0xff,0x00,0x1d,0xd5,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
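The dpp8 file repeats the same src1 change; the only difference from the dpp16 file is the src0 escape code in byte 4 (0xe9 for dpp8 and 0xea for dpp8 with fi:1, versus 0xfa for dpp16). A one-line extension of the sketch above, under the same assumptions (src0's ninth bit sits in byte 5 but is zero for all of these escape codes):

DPP_KIND = {0xfa: "dpp16", 0xe9: "dpp8", 0xea: "dpp8 fi:1"}
# e.g. the v_xor_b32 v5, v1, s3 line above
print(DPP_KIND[[0x05, 0x00, 0x1d, 0xd5, 0xe9, 0x06, 0x00, 0x00, 0x01, 0x77, 0x39, 0x05][4]])  # -> dpp8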
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
index e6ea6da..13e34ca 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp16.txt
@@ -21,6 +21,10 @@
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_class_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x7d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_class_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x7d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -76,6 +80,10 @@
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_class_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x7e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_class_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x7e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -131,6 +139,10 @@
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x02,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x02,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -186,6 +198,10 @@
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x12,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x12,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -241,6 +257,10 @@
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x32,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x32,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -296,6 +316,10 @@
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x42,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x42,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -351,6 +375,10 @@
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -406,6 +434,10 @@
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_eq_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -461,6 +493,10 @@
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x06,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x06,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -516,6 +552,10 @@
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x16,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x16,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -571,6 +611,10 @@
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x36,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x36,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -606,6 +650,9 @@
# GFX12: v_cmp_ge_i16_e64_dpp null, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7c,0x00,0x36,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+# GFX12: v_cmp_ge_i16_e64_dpp null, v255, 10 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30]
+0x7c,0x00,0x36,0xd4,0xfa,0x14,0x01,0x00,0xff,0x6f,0x0d,0x30
+
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
@@ -626,6 +673,10 @@
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x46,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x46,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -681,6 +732,10 @@
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -736,6 +791,10 @@
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ge_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -791,6 +850,10 @@
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x04,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x04,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -846,6 +909,10 @@
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x14,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x14,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -901,6 +968,10 @@
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x34,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x34,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -956,6 +1027,10 @@
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x44,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x44,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1011,6 +1086,10 @@
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1066,6 +1145,10 @@
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_gt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1121,6 +1204,10 @@
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x03,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x03,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1176,6 +1263,10 @@
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x13,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x13,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1231,6 +1322,10 @@
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x33,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x33,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1286,6 +1381,10 @@
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x43,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x43,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1341,6 +1440,10 @@
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1396,6 +1499,10 @@
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_le_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1451,6 +1558,10 @@
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x05,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x05,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1506,6 +1617,10 @@
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x15,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x15,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1561,6 +1676,10 @@
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x01,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x01,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1616,6 +1735,10 @@
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x11,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x11,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1671,6 +1794,10 @@
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x31,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x31,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1726,6 +1853,10 @@
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x41,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x41,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1781,6 +1912,10 @@
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x39,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x39,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1836,6 +1971,10 @@
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x49,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_lt_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x49,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1891,6 +2030,10 @@
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x35,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_i16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x35,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -1946,6 +2089,10 @@
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x45,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_i32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x45,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2001,6 +2148,10 @@
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x3d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_u16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x3d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2056,6 +2207,10 @@
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x4d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ne_u32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x4d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2111,6 +2266,10 @@
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_neq_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2166,6 +2325,10 @@
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_neq_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2221,6 +2384,10 @@
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x09,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nge_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x09,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2276,6 +2443,10 @@
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x19,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nge_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x19,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2331,6 +2502,10 @@
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ngt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2386,6 +2561,10 @@
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_ngt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2441,6 +2620,10 @@
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nle_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nle_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2496,6 +2679,10 @@
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nle_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2551,6 +2738,10 @@
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlg_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2606,6 +2797,10 @@
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlg_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2661,6 +2856,10 @@
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x0e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlt_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x0e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2716,6 +2915,10 @@
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x1e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_nlt_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x1e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2771,6 +2974,10 @@
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_o_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x07,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_o_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x07,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2826,6 +3033,10 @@
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_o_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x17,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_o_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2850,6 +3061,10 @@
# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x68,0x00,0x17,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+# W32: v_cmp_o_f32_e64_dpp s104, v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff]
+# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, 2.0 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff]
+0x68,0x00,0x17,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x50,0x01,0xff
+
# W32: v_cmp_o_f32_e64_dpp vcc_lo, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
# W64: v_cmp_o_f32_e64_dpp vcc, |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
0x6a,0x01,0x17,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
@@ -2881,6 +3096,10 @@
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_u_f16_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x08,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_u_f16_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x08,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
@@ -2936,6 +3155,10 @@
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
+# W32: v_cmp_u_f32_e64_dpp s10, v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff]
+0x0a,0x00,0x18,0xd4,0xfa,0x06,0x00,0x00,0x01,0x01,0x01,0xff
+
# W32: v_cmp_u_f32_e64_dpp s10, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
0x0a,0x00,0x18,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
index 98f8fd9..f36857b 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3c_dpp8.txt
@@ -5,6 +5,14 @@
# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_class_f16_e64_dpp s10, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7d,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# W32: v_cmp_class_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_class_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x7d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -24,6 +32,10 @@
# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_class_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_class_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x7e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_class_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_class_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x7e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -43,6 +55,10 @@
# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x02,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x02,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -62,6 +78,10 @@
# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x12,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x12,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -81,6 +101,10 @@
# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x32,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x32,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -100,6 +124,10 @@
# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x42,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x42,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -119,6 +147,10 @@
# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -138,6 +170,10 @@
# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_eq_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_eq_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_eq_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_eq_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -157,6 +193,10 @@
# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x06,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x06,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -176,6 +216,10 @@
# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x16,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x16,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -184,6 +228,10 @@
# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x6a,0x01,0x16,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_f32_e64_dpp vcc_lo, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_f32_e64_dpp vcc, |v1|, -2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05]
+0x6a,0x01,0x16,0xd4,0xe9,0xea,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_f32_e64_dpp ttmp14, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_f32_e64_dpp ttmp[14:15], -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
0x7a,0x02,0x16,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
@@ -195,6 +243,10 @@
# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x36,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x36,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -214,6 +266,10 @@
# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x46,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x46,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -233,6 +289,10 @@
# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -252,6 +312,14 @@
# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
+# W32: v_cmp_ge_u32_e64_dpp s10, v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ge_u32_e64_dpp s[10:11], v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4e,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ge_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ge_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -271,6 +339,10 @@
# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x04,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x04,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -290,6 +362,10 @@
# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x14,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x14,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -309,6 +385,10 @@
# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x34,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x34,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -328,6 +408,10 @@
# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x44,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x44,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -347,6 +431,10 @@
# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -366,6 +454,10 @@
# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_gt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_gt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_gt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_gt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -385,6 +477,10 @@
# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x03,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x03,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -404,6 +500,10 @@
# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x13,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x13,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -423,6 +523,10 @@
# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x33,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x33,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -442,6 +546,10 @@
# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x43,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x43,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -461,6 +569,10 @@
# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -480,6 +592,10 @@
# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_le_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_le_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_le_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_le_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -499,6 +615,10 @@
# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x05,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x05,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -518,6 +638,10 @@
# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x15,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x15,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -537,6 +661,10 @@
# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x01,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x01,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -556,6 +684,10 @@
# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x11,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x11,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -575,6 +707,10 @@
# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x31,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x31,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -594,6 +730,10 @@
# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x41,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x41,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -613,6 +753,10 @@
# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x39,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x39,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -632,6 +776,10 @@
# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_lt_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_lt_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x49,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_lt_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_lt_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x49,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -651,6 +799,10 @@
# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_i16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_i16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x35,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_i16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x35,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -670,6 +822,10 @@
# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_i32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_i32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x45,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_i32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_i32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x45,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -689,6 +845,10 @@
# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_u16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_u16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x3d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_u16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x3d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -708,6 +868,10 @@
# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ne_u32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ne_u32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x4d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ne_u32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ne_u32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x4d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -727,6 +891,10 @@
# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_neq_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_neq_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_neq_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -746,6 +914,10 @@
# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_neq_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_neq_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_neq_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_neq_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -765,6 +937,10 @@
# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nge_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nge_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x09,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nge_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x09,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -784,6 +960,10 @@
# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nge_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nge_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x19,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nge_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nge_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x19,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -803,6 +983,10 @@
# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ngt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ngt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ngt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -822,6 +1006,10 @@
# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_ngt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_ngt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_ngt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_ngt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -841,6 +1029,10 @@
# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nle_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nle_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nle_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -860,6 +1052,10 @@
# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nle_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nle_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nle_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nle_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -879,6 +1075,10 @@
# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -898,10 +1098,18 @@
# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlg_f32_e64_dpp s104, v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlg_f32_e64_dpp s[104:105], v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x68,0x00,0x1a,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlg_f32_e64_dpp vcc_lo, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlg_f32_e64_dpp vcc, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x6a,0x01,0x1a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -917,6 +1125,10 @@
# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlt_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlt_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x0e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlt_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x0e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -936,6 +1148,10 @@
# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_nlt_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_nlt_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x1e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_nlt_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_nlt_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x1e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -955,6 +1171,10 @@
# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_o_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_o_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x07,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_o_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x07,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -974,6 +1194,10 @@
# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_o_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_o_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x17,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_o_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_o_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x17,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -993,6 +1217,10 @@
# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_u_f16_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_u_f16_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x08,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_u_f16_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f16_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x08,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
@@ -1012,6 +1240,10 @@
# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x0a,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# W32: v_cmp_u_f32_e64_dpp s10, v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+# W64: v_cmp_u_f32_e64_dpp s[10:11], v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x0a,0x00,0x18,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# W32: v_cmp_u_f32_e64_dpp s104, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
# W64: v_cmp_u_f32_e64_dpp s[104:105], v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x68,0x00,0x18,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index eb7675f..0f933f0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -31,6 +31,9 @@
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xfd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -73,6 +76,9 @@
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xfe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xfe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -115,6 +121,9 @@
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -157,6 +166,9 @@
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x92,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -199,6 +211,9 @@
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -241,6 +256,9 @@
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc2,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -250,6 +268,9 @@
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
0x7e,0x00,0xc2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, 10 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13]
+0x7e,0x00,0xc2,0xd4,0xfa,0x14,0x01,0x00,0x01,0x60,0x01,0x13
+
# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
0x7e,0x00,0xc2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
@@ -283,6 +304,9 @@
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xba,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -325,6 +349,9 @@
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xca,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xca,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -367,6 +394,9 @@
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -409,6 +439,9 @@
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x96,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -451,6 +484,9 @@
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -493,6 +529,9 @@
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc6,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -535,6 +574,9 @@
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbe,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -577,6 +619,9 @@
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xce,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xce,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -619,6 +664,9 @@
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -661,6 +709,9 @@
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x94,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -703,6 +754,9 @@
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -745,6 +799,9 @@
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc4,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -787,6 +844,9 @@
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -829,6 +889,9 @@
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcc,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -871,6 +934,9 @@
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -913,6 +979,9 @@
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x93,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -955,6 +1024,9 @@
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -997,6 +1069,9 @@
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc3,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1039,6 +1114,9 @@
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1081,6 +1159,9 @@
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcb,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1123,6 +1204,9 @@
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1165,6 +1249,9 @@
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x95,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1207,6 +1294,9 @@
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x81,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x81,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1249,6 +1339,9 @@
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x91,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x91,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1291,6 +1384,9 @@
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1333,6 +1429,9 @@
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc1,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1375,6 +1474,9 @@
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1417,6 +1519,9 @@
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc9,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1459,6 +1564,9 @@
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xb5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1501,6 +1609,9 @@
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xc5,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xc5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1543,6 +1654,9 @@
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xbd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1585,6 +1699,9 @@
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0xcd,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0xcd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1627,6 +1744,9 @@
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1669,6 +1789,9 @@
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1711,6 +1834,9 @@
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1753,6 +1879,9 @@
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x99,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1795,6 +1924,9 @@
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1837,6 +1969,9 @@
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1879,6 +2014,9 @@
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1921,6 +2059,9 @@
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -1963,6 +2104,9 @@
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2005,6 +2149,9 @@
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2047,6 +2194,9 @@
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2089,6 +2239,9 @@
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x9e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2131,6 +2284,9 @@
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2173,6 +2329,9 @@
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x97,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2215,6 +2374,9 @@
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
@@ -2257,6 +2419,12 @@
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
+# GFX12: v_cmpx_u_f32_e64_dpp v1, 2.0 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x98,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x2f,0x01,0xff
+
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+0x7e,0x00,0x98,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
+
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
index d5e112e..bf4f971 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt
@@ -4,18 +4,27 @@
# GFX12: v_cmpx_class_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xfd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xfd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_class_f16_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7e,0x01,0xfd,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xfe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_class_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xfe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]
0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -28,6 +37,12 @@
# GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x92,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_eq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x92,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x92,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -40,30 +55,48 @@
# GFX12: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc2,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xba,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xca,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_eq_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xca,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -76,6 +109,9 @@
# GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x96,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x96,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -88,30 +124,45 @@
# GFX12: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc6,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbe,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xce,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ge_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xce,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -124,6 +175,9 @@
# GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x94,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x94,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -136,30 +190,45 @@
# GFX12: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc4,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_gt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcc,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -172,6 +241,9 @@
# GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x93,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x93,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -184,30 +256,45 @@
# GFX12: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc3,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_le_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcb,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -220,6 +307,9 @@
# GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x95,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x95,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -232,6 +322,9 @@
# GFX12: v_cmpx_lt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x81,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x81,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x81,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -244,6 +337,9 @@
# GFX12: v_cmpx_lt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x91,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x91,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x91,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -256,54 +352,84 @@
# GFX12: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc1,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_lt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_lt_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc9,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_lt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_i16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xb5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_i32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xc5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc5,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_ne_i32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xc5,0xd4,0xe9,0x14,0x01,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_i32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xc5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_u16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xbd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0xcd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ne_u32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0xcd,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -316,6 +442,9 @@
# GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_neq_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_neq_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -328,6 +457,12 @@
# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05
+
+# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -340,6 +475,9 @@
# GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nge_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x99,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nge_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x99,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -352,6 +490,9 @@
# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -364,6 +505,9 @@
# GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_ngt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_ngt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -376,6 +520,9 @@
# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -388,6 +535,9 @@
# GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nle_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nle_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -400,6 +550,9 @@
# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -412,6 +565,9 @@
# GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlg_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlg_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -424,6 +580,9 @@
# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -436,6 +595,9 @@
# GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_nlt_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x9e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_nlt_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x9e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -448,6 +610,9 @@
# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -460,6 +625,9 @@
# GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_o_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x97,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_o_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x97,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -472,6 +640,9 @@
# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
@@ -484,6 +655,9 @@
# GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
+# GFX12: v_cmpx_u_f32_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05]
+0x7e,0x00,0x98,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05
+
# GFX12: v_cmpx_u_f32_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
0x7e,0x01,0x98,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
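
The added GFX12 checks in the two disassembler files above (row/bank-mask DPP and dpp8) vary only the src1 operand field of the VOP3 encoding: 0x06,0x00 where the VGPR tests had 0x04,0x02, and 0xe8,0x01 or 0x14,0x01 for the inline-constant tests. A minimal Python sketch of that decoding follows; the field layout is read off the tests above, not cited from the ISA manual: bytes 4-7 form a little-endian dword with src0 in bits [8:0] (0xfa/0xe9 being the DPP markers) and src1 in bits [17:9], where operand codes 0-105 are SGPRs, 128+N is inline integer N, 244 is inline 2.0, and 256+N is VGPR vN.

# Hedged sketch, assumptions as stated in the note above.
def decode_src1(enc):
    dword = int.from_bytes(bytes(enc[4:8]), "little")
    code = (dword >> 9) & 0x1FF
    if code >= 256:
        return "v%d" % (code - 256)   # VGPR
    if code == 244:
        return "2.0"                  # inline float constant
    if 128 <= code <= 192:
        return str(code - 128)        # inline integer
    return "s%d" % code               # SGPR

# The added checks above decode accordingly:
assert decode_src1([0x7e, 0x00, 0x98, 0xd4, 0xfa, 0x06, 0x00, 0x00]) == "s3"
assert decode_src1([0x7e, 0x00, 0x92, 0xd4, 0xe9, 0xe8, 0x01, 0x00]) == "2.0"
assert decode_src1([0x7e, 0x00, 0xc5, 0xd4, 0xe9, 0x14, 0x01, 0x00]) == "10"
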
diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
index 1a73178..d6f10e9 100644
--- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6-el.txt
@@ -116,14 +116,14 @@
0x10 0x08 0x02 0x46 # CHECK: sel.s $f0, $f1, $f2
0x35 0x10 0x64 0x00 # CHECK: seleqz $2, $3, $4
0x37 0x10 0x64 0x00 # CHECK: selnez $2, $3, $4
-0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
-0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
+0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
-0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
-0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
0x04 0x00 0x42 0x34 # CHECK: ori $2, $2, 4
0x14 0x10 0x04 0x46 # CHECK: seleqz.s $f0, $f2, $f4
0x14 0x10 0x24 0x46 # CHECK: seleqz.d $f0, $f2, $f4
diff --git a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
index 53ea025..e1ba009 100644
--- a/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips32r6/valid-mips32r6.txt
@@ -92,8 +92,8 @@
0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4
0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
@@ -103,8 +103,8 @@
0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4
0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
index 9aeea45..a7dfbd2 100644
--- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6-el.txt
@@ -140,15 +140,15 @@
0x43 0x00 0x50 0xec # CHECK: lwupc $2, 268
0x98 0x18 0x24 0x46 # CHECK: maddf.d $f2, $f3, $f4
0x98 0x18 0x04 0x46 # CHECK: maddf.s $f2, $f3, $f4
-0x1d 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
-0x1d 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
+0x1e 0x10 0x24 0x46 # CHECK: max.d $f0, $f2, $f4
+0x1e 0x10 0x04 0x46 # CHECK: max.s $f0, $f2, $f4
0x1f 0x10 0x24 0x46 # CHECK: maxa.d $f0, $f2, $f4
0x1f 0x10 0x04 0x46 # CHECK: maxa.s $f0, $f2, $f4
0x01 0x78 0x08 0x40 # CHECK: mfc0 $8, $15, 1
0x1c 0x10 0x24 0x46 # CHECK: min.d $f0, $f2, $f4
0x1c 0x10 0x04 0x46 # CHECK: min.s $f0, $f2, $f4
-0x1e 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
-0x1e 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
+0x1d 0x10 0x24 0x46 # CHECK: mina.d $f0, $f2, $f4
+0x1d 0x10 0x04 0x46 # CHECK: mina.s $f0, $f2, $f4
0xda 0x10 0x64 0x00 # CHECK: mod $2, $3, $4
0xdb 0x10 0x64 0x00 # CHECK: modu $2, $3, $4
0x25 0x78 0xe0 0x03 # CHECK: move $15, $ra
diff --git a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
index 32b91c6..0030e51 100644
--- a/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
+++ b/llvm/test/MC/Disassembler/Mips/mips64r6/valid-mips64r6.txt
@@ -111,8 +111,8 @@
0x46 0x04 0x10 0x14 # CHECK: seleqz.s $f0, $f2, $f4
0x46 0x04 0x10 0x17 # CHECK: selnez.s $f0, $f2, $f4
0x46 0x04 0x10 0x1c # CHECK: min.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1d # CHECK: max.s $f0, $f2, $f4
-0x46 0x04 0x10 0x1e # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1d # CHECK: mina.s $f0, $f2, $f4
+0x46 0x04 0x10 0x1e # CHECK: max.s $f0, $f2, $f4
0x46 0x04 0x10 0x1f # CHECK: maxa.s $f0, $f2, $f4
0x46 0x04 0x18 0x98 # CHECK: maddf.s $f2, $f3, $f4
0x46 0x04 0x18 0x99 # CHECK: msubf.s $f2, $f3, $f4
@@ -122,8 +122,8 @@
0x46 0x24 0x10 0x14 # CHECK: seleqz.d $f0, $f2, $f4
0x46 0x24 0x10 0x17 # CHECK: selnez.d $f0, $f2, $f4
0x46 0x24 0x10 0x1c # CHECK: min.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1d # CHECK: max.d $f0, $f2, $f4
-0x46 0x24 0x10 0x1e # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1d # CHECK: mina.d $f0, $f2, $f4
+0x46 0x24 0x10 0x1e # CHECK: max.d $f0, $f2, $f4
0x46 0x24 0x10 0x1f # CHECK: maxa.d $f0, $f2, $f4
0x46 0x24 0x18 0x98 # CHECK: maddf.d $f2, $f3, $f4
0x46 0x24 0x18 0x99 # CHECK: msubf.d $f2, $f3, $f4
diff --git a/llvm/test/MC/Mips/mips32r6/valid.s b/llvm/test/MC/Mips/mips32r6/valid.s
index 0f098a1..0d705b6 100644
--- a/llvm/test/MC/Mips/mips32r6/valid.s
+++ b/llvm/test/MC/Mips/mips32r6/valid.s
@@ -170,14 +170,14 @@ a:
sel.s $f0,$f1,$f2 # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
seleqz $2,$3,$4 # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35]
selnez $2,$3,$4 # CHECK: selnez $2, $3, $4 # encoding: [0x00,0x64,0x10,0x37]
- max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
- max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
+ max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
+ max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c]
min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c]
maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f]
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
- mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
- mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
+ mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
or $2, 4 # CHECK: ori $2, $2, 4 # encoding: [0x34,0x42,0x00,0x04]
seleqz.s $f0, $f2, $f4 # CHECK: seleqz.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x14]
seleqz.d $f0, $f2, $f4 # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14]
diff --git a/llvm/test/MC/Mips/mips64r6/valid.s b/llvm/test/MC/Mips/mips64r6/valid.s
index c50bd9e..ff6e1d7 100644
--- a/llvm/test/MC/Mips/mips64r6/valid.s
+++ b/llvm/test/MC/Mips/mips64r6/valid.s
@@ -183,14 +183,14 @@ a:
lwupc $2,268 # CHECK: lwupc $2, 268 # encoding: [0xec,0x50,0x00,0x43]
maddf.d $f2,$f3,$f4 # CHECK: maddf.d $f2, $f3, $f4 # encoding: [0x46,0x24,0x18,0x98]
maddf.s $f2,$f3,$f4 # CHECK: maddf.s $f2, $f3, $f4 # encoding: [0x46,0x04,0x18,0x98]
- max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
- max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
+ max.d $f0, $f2, $f4 # CHECK: max.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
+ max.s $f0, $f2, $f4 # CHECK: max.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
maxa.d $f0, $f2, $f4 # CHECK: maxa.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1f]
maxa.s $f0, $f2, $f4 # CHECK: maxa.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1f]
min.d $f0, $f2, $f4 # CHECK: min.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1c]
min.s $f0, $f2, $f4 # CHECK: min.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1c]
- mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1e]
- mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1e]
+ mina.d $f0, $f2, $f4 # CHECK: mina.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x1d]
+ mina.s $f0, $f2, $f4 # CHECK: mina.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x1d]
mfc0 $8,$15,1 # CHECK: mfc0 $8, $15, 1 # encoding: [0x40,0x08,0x78,0x01]
mod $2,$3,$4 # CHECK: mod $2, $3, $4 # encoding: [0x00,0x64,0x10,0xda]
modu $2,$3,$4 # CHECK: modu $2, $3, $4 # encoding: [0x00,0x64,0x10,0xdb]
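
The four MIPS diffs above are one fix seen from the disassembler (both endiannesses) and assembler sides: the COP1 function-field values for max.fmt and mina.fmt were swapped, and the corrected expectations pin function 0x1d to mina and 0x1e to max (0x1c min and 0x1f maxa are unchanged). A small sketch, assuming the standard COP1 field layout and taking the function mapping from the corrected checks above rather than the ISA manual:

FUNC = {0x1C: "min", 0x1D: "mina", 0x1E: "max", 0x1F: "maxa"}
FMT = {0x10: "s", 0x11: "d"}

def decode_fp_minmax(word):
    fmt = (word >> 21) & 0x1F   # 0x10 = single, 0x11 = double
    ft = (word >> 16) & 0x1F
    fs = (word >> 11) & 0x1F
    fd = (word >> 6) & 0x1F
    func = word & 0x3F
    return "%s.%s $f%d, $f%d, $f%d" % (FUNC[func], FMT[fmt], fd, fs, ft)

# Matches the corrected encodings above:
assert decode_fp_minmax(0x4604101E) == "max.s $f0, $f2, $f4"
assert decode_fp_minmax(0x4624101D) == "mina.d $f0, $f2, $f4"
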
diff --git a/llvm/test/MachineVerifier/test_g_fcmp.mir b/llvm/test/MachineVerifier/test_g_fcmp.mir
index 9a73569..17be746 100644
--- a/llvm/test/MachineVerifier/test_g_fcmp.mir
+++ b/llvm/test/MachineVerifier/test_g_fcmp.mir
@@ -24,17 +24,22 @@ body: |
%4:_(<2 x s32>) = G_IMPLICIT_DEF
%5:_(s1) = G_FCMP floatpred(oeq), %3, %4
- ; mismatched element count
+ ; mismatched fixed element count
; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
%6:_(<2 x s32>) = G_IMPLICIT_DEF
%7:_(<2 x s32>) = G_IMPLICIT_DEF
%8:_(<4 x s1>) = G_FCMP floatpred(oeq), %6, %7
+ ; mismatched scalable element count
+ ; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
+ %9:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %10:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %11:_(<vscale x 4 x s1>) = G_FCMP floatpred(oeq), %9, %10
; mismatched scalar element type
; CHECK: *** Bad machine code: Type mismatch in generic instruction ***
- %9:_(s32) = G_FCONSTANT float 0.0
- %10:_(s64) = G_FCONSTANT float 1.0
- %11:_(s1) = G_FCMP floatpred(oeq), %9, %10
+ %12:_(s32) = G_FCONSTANT float 0.0
+ %13:_(s64) = G_FCONSTANT float 1.0
+ %14:_(s1) = G_FCMP floatpred(oeq), %12, %13
...
diff --git a/llvm/test/MachineVerifier/test_g_icmp.mir b/llvm/test/MachineVerifier/test_g_icmp.mir
index 7c64e25..74e3d34 100644
--- a/llvm/test/MachineVerifier/test_g_icmp.mir
+++ b/llvm/test/MachineVerifier/test_g_icmp.mir
@@ -24,17 +24,22 @@ body: |
%4:_(<2 x s32>) = G_IMPLICIT_DEF
%5:_(s1) = G_ICMP intpred(eq), %3, %4
- ; mismatched element count
+ ; mismatched fixed element count
; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
%6:_(<2 x s32>) = G_IMPLICIT_DEF
%7:_(<2 x s32>) = G_IMPLICIT_DEF
%8:_(<4 x s1>) = G_ICMP intpred(eq), %6, %7
+ ; mismatched scalable element count
+ ; CHECK: Bad machine code: Generic vector icmp/fcmp must preserve number of
+ %9:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %10:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+ %11:_(<vscale x 4 x s1>) = G_ICMP intpred(eq), %9, %10
; mismatched scalar element type
; CHECK: *** Bad machine code: Type mismatch in generic instruction ***
- %9:_(s32) = G_CONSTANT i32 0
- %10:_(s64) = G_CONSTANT i32 1
- %11:_(s1) = G_ICMP intpred(eq), %9, %10
+ %12:_(s32) = G_CONSTANT i32 0
+ %13:_(s64) = G_CONSTANT i32 1
+ %14:_(s1) = G_ICMP intpred(eq), %12, %13
...
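
Both MachineVerifier tests above gain the same case: the fixed-vector element-count check on G_ICMP/G_FCMP is now also exercised with scalable vectors, where <vscale x 2 x s32> operands must not produce a <vscale x 4 x s1> result. As an illustration only (not the verifier's actual code), the invariant compares element counts including scalability, modeled here as (min_elements, is_scalable) pairs:

def preserves_element_count(result_ec, operand_ec):
    return result_ec == operand_ec

assert not preserves_element_count((4, False), (2, False))  # <4 x s1> vs <2 x s32>
assert not preserves_element_count((4, True), (2, True))    # <vscale x 4 x s1> vs <vscale x 2 x s32>
assert preserves_element_count((2, True), (2, True))        # matching scalable counts are fine
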
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 4ab5567..493350d 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -984,10 +984,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0},
{X86::RORX64ri, X86::RORX64mi, 0},
{X86::RORX64ri_EVEX, X86::RORX64mi_EVEX, 0},
- {X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16},
- {X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16},
- {X86::ROUNDSDr, X86::ROUNDSDm, 0},
- {X86::ROUNDSSr, X86::ROUNDSSm, 0},
+ {X86::ROUNDPDri, X86::ROUNDPDmi, TB_ALIGN_16},
+ {X86::ROUNDPSri, X86::ROUNDPSmi, TB_ALIGN_16},
+ {X86::ROUNDSDri, X86::ROUNDSDmi, 0},
+ {X86::ROUNDSSri, X86::ROUNDSSmi, 0},
{X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16},
{X86::RSQRTSSr, X86::RSQRTSSm, 0},
{X86::SAR16r1_ND, X86::SAR16m1_ND, 0},
@@ -1791,10 +1791,10 @@ static const X86FoldTableEntry Table1[] = {
{X86::VRNDSCALEPSZ128rri, X86::VRNDSCALEPSZ128rmi, 0},
{X86::VRNDSCALEPSZ256rri, X86::VRNDSCALEPSZ256rmi, 0},
{X86::VRNDSCALEPSZrri, X86::VRNDSCALEPSZrmi, 0},
- {X86::VROUNDPDYr, X86::VROUNDPDYm, 0},
- {X86::VROUNDPDr, X86::VROUNDPDm, 0},
- {X86::VROUNDPSYr, X86::VROUNDPSYm, 0},
- {X86::VROUNDPSr, X86::VROUNDPSm, 0},
+ {X86::VROUNDPDYri, X86::VROUNDPDYmi, 0},
+ {X86::VROUNDPDri, X86::VROUNDPDmi, 0},
+ {X86::VROUNDPSYri, X86::VROUNDPSYmi, 0},
+ {X86::VROUNDPSri, X86::VROUNDPSmi, 0},
{X86::VRSQRT14PDZ128r, X86::VRSQRT14PDZ128m, 0},
{X86::VRSQRT14PDZ256r, X86::VRSQRT14PDZ256m, 0},
{X86::VRSQRT14PDZr, X86::VRSQRT14PDZm, 0},
@@ -2234,8 +2234,8 @@ static const X86FoldTableEntry Table2[] = {
{X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16},
{X86::PXORrr, X86::PXORrm, TB_ALIGN_16},
{X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE},
- {X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE},
- {X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::ROUNDSDri_Int, X86::ROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::ROUNDSSri_Int, X86::ROUNDSSmi_Int, TB_NO_REVERSE},
{X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE},
{X86::SBB16rr, X86::SBB16rm, 0},
{X86::SBB16rr_ND, X86::SBB16rm_ND, 0},
@@ -3778,10 +3778,10 @@ static const X86FoldTableEntry Table2[] = {
{X86::VRNDSCALESHZr_Int, X86::VRNDSCALESHZm_Int, TB_NO_REVERSE},
{X86::VRNDSCALESSZr, X86::VRNDSCALESSZm, 0},
{X86::VRNDSCALESSZr_Int, X86::VRNDSCALESSZm_Int, TB_NO_REVERSE},
- {X86::VROUNDSDr, X86::VROUNDSDm, 0},
- {X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE},
- {X86::VROUNDSSr, X86::VROUNDSSm, 0},
- {X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE},
+ {X86::VROUNDSDri, X86::VROUNDSDmi, 0},
+ {X86::VROUNDSDri_Int, X86::VROUNDSDmi_Int, TB_NO_REVERSE},
+ {X86::VROUNDSSri, X86::VROUNDSSmi, 0},
+ {X86::VROUNDSSri_Int, X86::VROUNDSSmi_Int, TB_NO_REVERSE},
{X86::VRSQRT14PDZ128rkz, X86::VRSQRT14PDZ128mkz, 0},
{X86::VRSQRT14PDZ256rkz, X86::VRSQRT14PDZ256mkz, 0},
{X86::VRSQRT14PDZrkz, X86::VRSQRT14PDZmkz, 0},
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
index cc66cbe..ce1b591 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll
@@ -102,9 +102,9 @@ if.end8:
define i32 @test4(i32 %c) nounwind {
; CHECK-LABEL: @test4(
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 2, label [[SW_BB]]
-; CHECK-NEXT: i32 4, label [[SW_BB]]
+; CHECK-NEXT: i32 1, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 2, label [[SW_BB]]
+; CHECK-NEXT: i32 4, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: br i1 true, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
@@ -207,8 +207,8 @@ define i1 @test7(i32 %c) nounwind {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
; CHECK-NEXT: switch i32 [[C:%.*]], label [[SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
-; CHECK-NEXT: i32 7, label [[SW_BB]]
+; CHECK-NEXT: i32 6, label [[SW_BB:%.*]]
+; CHECK-NEXT: i32 7, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: sw.bb:
; CHECK-NEXT: ret i1 true
@@ -790,8 +790,8 @@ define i32 @test18(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i32 93
@@ -817,8 +817,8 @@ define i8 @test19(i8 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i8 [[A]], label [[DISPATCH]] [
-; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
-; CHECK-NEXT: i8 -111, label [[DISPATCH]]
+; CHECK-NEXT: i8 93, label [[TARGET93:%.*]]
+; CHECK-NEXT: i8 -111, label [[DISPATCH]]
; CHECK-NEXT: ]
; CHECK: target93:
; CHECK-NEXT: ret i8 96
@@ -846,8 +846,8 @@ define i1 @test20(i64 %a) {
; CHECK-NEXT: br label [[DISPATCH:%.*]]
; CHECK: dispatch:
; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
-; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
+; CHECK-NEXT: i64 0, label [[EXIT2:%.*]]
+; CHECK-NEXT: i64 -2147483647, label [[EXIT2]]
; CHECK-NEXT: ]
; CHECK: default:
; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[B]], 0
@@ -1123,6 +1123,70 @@ else:
ret i1 true
}
+define i1 @icmp_eq_range_attr(i8 range(i8 1, 0) %i) {
+; CHECK-LABEL: @icmp_eq_range_attr(
+; CHECK-NEXT: ret i1 false
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_attr(i8 range(i8 -1, 1) %i) {
+; CHECK-LABEL: @neg_icmp_eq_range_attr(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I:%.*]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare range(i8 1, 0) i8 @returns_non_zero_range_helper()
+declare range(i8 -1, 1) i8 @returns_contain_zero_range_helper()
+
+define i1 @icmp_eq_range_return() {
+; CHECK-LABEL: @icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_non_zero_range_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call i8 @returns_non_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_return() {
+; CHECK-LABEL: @neg_icmp_eq_range_return(
+; CHECK-NEXT: [[I:%.*]] = call i8 @returns_contain_zero_range_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call i8 @returns_contain_zero_range_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+declare i8 @returns_i8_helper()
+
+define i1 @icmp_eq_range_call() {
+; CHECK-LABEL: @icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 1, 0) i8 @returns_i8_helper()
+; CHECK-NEXT: ret i1 false
+;
+ %i = call range(i8 1, 0) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
+define i1 @neg_icmp_eq_range_call() {
+; CHECK-LABEL: @neg_icmp_eq_range_call(
+; CHECK-NEXT: [[I:%.*]] = call range(i8 0, 11) i8 @returns_i8_helper()
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[I]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %i = call range(i8 0, 11) i8 @returns_i8_helper()
+ %cmp = icmp eq i8 %i, 0
+ ret i1 %cmp
+}
+
declare i16 @llvm.ctlz.i16(i16, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i16 @llvm.ctpop.i16(i16)
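
The new range-attribute tests above rely on wrapped half-open range semantics: range(i8 1, 0) denotes [1, 0) modulo 256, i.e. every i8 value except 0, so icmp eq against 0 folds to false, while range(i8 -1, 1) denotes {-1, 0}, which contains 0, so the compare survives. A minimal sketch of that membership test (mirroring, not reproducing, LLVM's ConstantRange):

def range_contains(lo, hi, x, bits=8):
    m = 1 << bits
    lo, hi, x = lo % m, hi % m, x % m
    if lo < hi:
        return lo <= x < hi
    return x >= lo or x < hi  # wrapped range

assert not range_contains(1, 0, 0)  # range(i8 1, 0): excludes 0 -> fold to false
assert range_contains(-1, 1, 0)     # range(i8 -1, 1): contains 0 -> compare kept
assert range_contains(0, 11, 0)     # range(i8 0, 11): contains 0 -> compare kept
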
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
index 75130c2..e058c5b 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptosi129.ll
@@ -176,3 +176,80 @@ define i129 @fp128tosi129(fp128 %a) {
%conv = fptosi fp128 %a to i129
ret i129 %conv
}
+
+define <2 x i129> @floattosi129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattosi129v2(
+; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT: [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT: [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT: br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK: fp-to-i-if-end2:
+; CHECK-NEXT: [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT: br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK: fp-to-i-if-then53:
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-end94:
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT: br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK: fp-to-i-if-then125:
+; CHECK-NEXT: [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-else6:
+; CHECK-NEXT: [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT: [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-cleanup1:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT: [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT: [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT: [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT: br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK: fp-to-i-if-end:
+; CHECK-NEXT: [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT: br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK: fp-to-i-if-then5:
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-end9:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT: br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK: fp-to-i-if-then12:
+; CHECK-NEXT: [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-else:
+; CHECK-NEXT: [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT: [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-cleanup:
+; CHECK-NEXT: [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT: ret <2 x i129> [[TMP43]]
+;
+ %conv = fptosi <2 x float> %a to <2 x i129>
+ ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
index ed630d7..c699f80 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-fptoui129.ll
@@ -176,3 +176,80 @@ define i129 @fp128toui129(fp128 %a) {
%conv = fptoui fp128 %a to i129
ret i129 %conv
}
+
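+; The unsigned vector conversion currently expands to the same lane-by-lane
+; sequence as the signed case, including the sign-select scaffolding.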
+define <2 x i129> @floattoui129v2(<2 x float> %a) {
+; CHECK-LABEL: @floattoui129v2(
+; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i129 [[TMP2]], 23
+; CHECK-NEXT: [[TMP6:%.*]] = and i129 [[TMP5]], 255
+; CHECK-NEXT: [[TMP7:%.*]] = and i129 [[TMP2]], 8388607
+; CHECK-NEXT: [[TMP8:%.*]] = or i129 [[TMP7]], 8388608
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i129 [[TMP6]], 127
+; CHECK-NEXT: br i1 [[TMP9]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_END2:%.*]]
+; CHECK: fp-to-i-if-end2:
+; CHECK-NEXT: [[TMP10:%.*]] = add i129 [[TMP6]], -256
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i129 [[TMP10]], -129
+; CHECK-NEXT: br i1 [[TMP11]], label [[FP_TO_I_IF_THEN53:%.*]], label [[FP_TO_I_IF_END94:%.*]]
+; CHECK: fp-to-i-if-then53:
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP3]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-end94:
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ult i129 [[TMP6]], 150
+; CHECK-NEXT: br i1 [[TMP13]], label [[FP_TO_I_IF_THEN125:%.*]], label [[FP_TO_I_IF_ELSE6:%.*]]
+; CHECK: fp-to-i-if-then125:
+; CHECK-NEXT: [[TMP14:%.*]] = sub i129 150, [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 [[TMP8]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i129 [[TMP15]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-if-else6:
+; CHECK-NEXT: [[TMP17:%.*]] = add i129 [[TMP6]], -150
+; CHECK-NEXT: [[TMP18:%.*]] = shl i129 [[TMP8]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = mul i129 [[TMP18]], [[TMP4]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
+; CHECK: fp-to-i-cleanup1:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP12]], [[FP_TO_I_IF_THEN53]] ], [ [[TMP16]], [[FP_TO_I_IF_THEN125]] ], [ [[TMP19]], [[FP_TO_I_IF_ELSE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i129> poison, i129 [[TMP20]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[A]], i64 1
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP23]], -1
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i129 1, i129 -1
+; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP24]], 23
+; CHECK-NEXT: [[TMP28:%.*]] = and i129 [[TMP27]], 255
+; CHECK-NEXT: [[TMP29:%.*]] = and i129 [[TMP24]], 8388607
+; CHECK-NEXT: [[TMP30:%.*]] = or i129 [[TMP29]], 8388608
+; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i129 [[TMP28]], 127
+; CHECK-NEXT: br i1 [[TMP31]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_END:%.*]]
+; CHECK: fp-to-i-if-end:
+; CHECK-NEXT: [[TMP32:%.*]] = add i129 [[TMP28]], -256
+; CHECK-NEXT: [[TMP33:%.*]] = icmp ult i129 [[TMP32]], -129
+; CHECK-NEXT: br i1 [[TMP33]], label [[FP_TO_I_IF_THEN5:%.*]], label [[FP_TO_I_IF_END9:%.*]]
+; CHECK: fp-to-i-if-then5:
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP25]], i129 340282366920938463463374607431768211455, i129 -340282366920938463463374607431768211456
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-end9:
+; CHECK-NEXT: [[TMP35:%.*]] = icmp ult i129 [[TMP28]], 150
+; CHECK-NEXT: br i1 [[TMP35]], label [[FP_TO_I_IF_THEN12:%.*]], label [[FP_TO_I_IF_ELSE:%.*]]
+; CHECK: fp-to-i-if-then12:
+; CHECK-NEXT: [[TMP36:%.*]] = sub i129 150, [[TMP28]]
+; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP30]], [[TMP36]]
+; CHECK-NEXT: [[TMP38:%.*]] = mul i129 [[TMP37]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-if-else:
+; CHECK-NEXT: [[TMP39:%.*]] = add i129 [[TMP28]], -150
+; CHECK-NEXT: [[TMP40:%.*]] = shl i129 [[TMP30]], [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = mul i129 [[TMP40]], [[TMP26]]
+; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
+; CHECK: fp-to-i-cleanup:
+; CHECK-NEXT: [[TMP42:%.*]] = phi i129 [ [[TMP34]], [[FP_TO_I_IF_THEN5]] ], [ [[TMP38]], [[FP_TO_I_IF_THEN12]] ], [ [[TMP41]], [[FP_TO_I_IF_ELSE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i129> [[TMP21]], i129 [[TMP42]], i64 1
+; CHECK-NEXT: ret <2 x i129> [[TMP43]]
+;
+ %conv = fptoui <2 x float> %a to <2 x i129>
+ ret <2 x i129> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
index 76f5248..f70ce2f 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-si129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @si129tofp128(i129 %a) {
%conv = sitofp i129 %a to fp128
ret fp128 %conv
}
+
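+; The int-to-float direction is likewise scalarized: each i129 lane runs the
+; ctlz/round/pack sequence (round the significand, then pack sign, exponent,
+; and mantissa into the i32 float bits) before the lanes are rebuilt.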
+define <2 x float> @si129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @si129tofloatv2(
+; CHECK-NEXT: itofp-entryitofp-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK: itofp-if-end2:
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK: itofp-if-then43:
+; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb4:
+; CHECK-NEXT: [[TMP10:%.*]] = shl i129 [[TMP4]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-default5:
+; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP4]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-epilog6:
+; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP4]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = ashr i129 [[TMP27]], 2
+; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK: itofp-if-then207:
+; CHECK-NEXT: [[TMP33:%.*]] = ashr i129 [[TMP27]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-else8:
+; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP4]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-end269:
+; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
+; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN1]]
+; CHECK: itofp-return1:
+; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK: itofp-if-end:
+; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP59]], i1 true)
+; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK: itofp-if-then4:
+; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb:
+; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP59]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-default:
+; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP59]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP59]]
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-epilog:
+; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP59]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT: [[TMP83:%.*]] = ashr i129 [[TMP82]], 2
+; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK: itofp-if-then20:
+; CHECK-NEXT: [[TMP88:%.*]] = ashr i129 [[TMP82]], 3
+; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-else:
+; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP59]], [[TMP93]]
+; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-end26:
+; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP105]], [[TMP103]]
+; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN]]
+; CHECK: itofp-return:
+; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: ret <2 x float> [[TMP109]]
+;
+ %conv = sitofp <2 x i129> %a to <2 x float>
+ ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
index 96d87a5..ee54d53 100644
--- a/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandLargeFpConvert/X86/expand-large-fp-convert-ui129tofp.ll
@@ -426,3 +426,166 @@ define fp128 @ui129tofp128(i129 %a) {
%conv = uitofp i129 %a to fp128
ret fp128 %conv
}
+
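+; The unsigned variant still emits the abs/sign scaffolding, but ctlz and
+; the shifts consume the original lane value and the post-rounding shifts
+; are lshr rather than ashr.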
+define <2 x float> @ui129tofloatv2(<2 x i129> %a) {
+; CHECK-LABEL: @ui129tofloatv2(
+; CHECK-NEXT: itofp-entryitofp-entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
+; CHECK: itofp-if-end2:
+; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
+; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP0]], i1 true)
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
+; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
+; CHECK: itofp-if-then43:
+; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb4:
+; CHECK-NEXT: [[TMP10:%.*]] = shl i129 [[TMP0]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-default5:
+; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP0]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP0]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
+; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
+; CHECK: itofp-sw-epilog6:
+; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP0]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
+; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
+; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
+; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP27]], 2
+; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
+; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
+; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
+; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
+; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
+; CHECK: itofp-if-then207:
+; CHECK-NEXT: [[TMP33:%.*]] = lshr i129 [[TMP27]], 3
+; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-else8:
+; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
+; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
+; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP0]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
+; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
+; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END269]]
+; CHECK: itofp-if-end269:
+; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
+; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
+; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
+; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
+; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
+; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
+; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
+; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
+; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN1]]
+; CHECK: itofp-return1:
+; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
+; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP55]], 0
+; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
+; CHECK: itofp-if-end:
+; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP55]], 128
+; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP55]]
+; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
+; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP55]], i1 true)
+; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
+; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
+; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
+; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
+; CHECK: itofp-if-then4:
+; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
+; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
+; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
+; CHECK-NEXT: ]
+; CHECK: itofp-sw-bb:
+; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP55]], 1
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-default:
+; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
+; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
+; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP55]], [[TMP67]]
+; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
+; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
+; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
+; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP55]]
+; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
+; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
+; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
+; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
+; CHECK: itofp-sw-epilog:
+; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP55]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
+; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
+; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
+; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
+; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
+; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
+; CHECK-NEXT: [[TMP83:%.*]] = lshr i129 [[TMP82]], 2
+; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
+; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
+; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
+; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
+; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
+; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
+; CHECK: itofp-if-then20:
+; CHECK-NEXT: [[TMP88:%.*]] = lshr i129 [[TMP82]], 3
+; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
+; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
+; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-else:
+; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
+; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
+; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP55]], [[TMP93]]
+; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
+; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
+; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
+; CHECK-NEXT: br label [[ITOFP_IF_END26]]
+; CHECK: itofp-if-end26:
+; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
+; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
+; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
+; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
+; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
+; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
+; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
+; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP104]], [[TMP103]]
+; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
+; CHECK-NEXT: br label [[ITOFP_RETURN]]
+; CHECK: itofp-return:
+; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: ret <2 x float> [[TMP109]]
+;
+ %conv = uitofp <2 x i129> %a to <2 x float>
+ ret <2 x float> %conv
+}
diff --git a/llvm/test/Transforms/InstCombine/implies.ll b/llvm/test/Transforms/InstCombine/implies.ll
new file mode 100644
index 0000000..c02d84d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/implies.ll
@@ -0,0 +1,424 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
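+; Each test branches on one comparison and then re-asks a question that the
+; dominating branch already answers; when the edge condition implies the
+; second compare (isImpliedCondition-style reasoning), it folds to a constant.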
+define i1 @or_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, 23
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_implies_sle_fail(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], -34
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[OR]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %x, -34
+ %cond = icmp sle i8 %or, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
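+; 'or disjoint' guarantees the operands share no set bits, so the or acts as
+; a carry-free add: with both flags present, x | 23 is unsigned-less-than
+; x | 24, and a bound proven for the larger value carries over to the smaller.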
+define i1 @or_disjoint_implies_ule(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_disjoint_implies_ule(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_disjoint_implies_ule_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_disjoint_implies_ule_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 28
+; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 28
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @or_prove_disjoint_implies_ule(i8 %xx, i8 %y, i1 %other) {
+; CHECK-LABEL: @or_prove_disjoint_implies_ule(
+; CHECK-NEXT: [[X:%.*]] = and i8 [[XX:%.*]], -16
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X]], 10
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x = and i8 %xx, -16
+ %x1 = or i8 %x, 7
+ %x2 = or i8 %x, 10
+
+ %cond = icmp ule i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_disjoint_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_disjoint_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_disjoint_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_or_disjoint_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = or disjoint i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp slt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = or disjoint i8 [[X]], 23
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = or disjoint i8 %x, 23
+ %x2 = or disjoint i8 %x, 24
+
+ %cond = icmp sle i8 %y, %x2
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 24
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 23
+ %x2 = add nsw i8 %x, 24
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_addnsw_implies_sle_fail(i8 %x, i8 %y, i1 %other) {
+; CHECK-LABEL: @src_addnsw_implies_sle_fail(
+; CHECK-NEXT: [[X2:%.*]] = add nsw i8 [[X:%.*]], 23
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X2]], [[Y:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[X1:%.*]] = add nsw i8 [[X]], 24
+; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[X1]], [[Y]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %x1 = add nsw i8 %x, 24
+ %x2 = add nsw i8 %x, 23
+
+ %cond = icmp sle i8 %x2, %y
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x1, %y
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ult i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %z, %x
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_ult_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_ult_fail(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Z]]
+; CHECK-NEXT: [[R:%.*]] = icmp ne i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %z
+ %r = icmp ult i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_and_implies_slt_fail(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_and_implies_slt_fail(
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[AND]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp slt i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %and = and i8 %x, %y
+ %r = icmp slt i8 %and, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_ule(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %or = or i8 %y, %x
+ %cond = icmp uge i8 %z, %or
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_or_implies_false_ugt_todo(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_or_implies_false_ugt_todo(
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[OR]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[Z]]
+; CHECK-NEXT: ret i1 [[R]]
+;
+ %or = or i8 %x, %y
+ %cond = icmp ugt i8 %or, %z
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %r = icmp ugt i8 %x, %z
+ ret i1 %r
+}
+
+define i1 @src_udiv_implies_ult(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult(
+; CHECK-NEXT: [[COND:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[T:%.*]], label [[F:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ugt i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ %div = udiv i8 %x, 3
+ %r = icmp ult i8 %div, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_udiv_implies_ult2(i8 %x, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_udiv_implies_ult2(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[Z:%.*]], [[X:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+; CHECK: F:
+; CHECK-NEXT: ret i1 true
+;
+ %cond = icmp ule i8 %z, %x
+ br i1 %cond, label %T, label %F
+T:
+ ret i1 %other
+F:
+ %div = udiv i8 %x, 3
+ %r = icmp ult i8 %div, %z
+ ret i1 %r
+}
+
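+; min(x, y) never exceeds x and x never exceeds max(x, y) (in the matching
+; signedness), so a dominating bound on x, or on the max, transfers across
+; the intrinsic call.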
+define i1 @src_smin_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smin_implies_sle(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp sle i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.smin.i8(i8 %x, i8 %y)
+ %r = icmp sle i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umin_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umin_implies_ule(
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[X:%.*]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %cond = icmp ule i8 %x, %z
+ br i1 %cond, label %T, label %F
+T:
+ %um = call i8 @llvm.umin.i8(i8 %x, i8 %y)
+ %r = icmp ule i8 %um, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_umax_implies_ule(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_umax_implies_ule(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.umax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ugt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.umax.i8(i8 %x, i8 %y)
+ %cond = icmp ule i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp ule i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
+
+define i1 @src_smax_implies_sle(i8 %x, i8 %y, i8 %z, i1 %other) {
+; CHECK-LABEL: @src_smax_implies_sle(
+; CHECK-NEXT: [[UM:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y:%.*]])
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp sgt i8 [[UM]], [[Z:%.*]]
+; CHECK-NEXT: br i1 [[COND_NOT]], label [[F:%.*]], label [[T:%.*]]
+; CHECK: T:
+; CHECK-NEXT: ret i1 true
+; CHECK: F:
+; CHECK-NEXT: ret i1 [[OTHER:%.*]]
+;
+ %um = call i8 @llvm.smax.i8(i8 %x, i8 %y)
+ %cond = icmp sle i8 %um, %z
+ br i1 %cond, label %T, label %F
+T:
+ %r = icmp sle i8 %x, %z
+ ret i1 %r
+F:
+ ret i1 %other
+}
diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll
index 5305c78..769f766 100644
--- a/llvm/test/Transforms/InstCombine/known-bits.ll
+++ b/llvm/test/Transforms/InstCombine/known-bits.ll
@@ -124,7 +124,6 @@ exit:
ret i8 %or2
}
-
define i8 @test_cond_and_bothways(i8 %x) {
; CHECK-LABEL: @test_cond_and_bothways(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91
@@ -181,8 +180,6 @@ exit:
ret i8 %or2
}
-
-
define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) {
; CHECK-LABEL: @test_cond_and_commuted(
; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3
@@ -343,7 +340,7 @@ exit:
ret i8 %or2
}
-define i32 @test_icmp_trunc1(i32 %x){
+define i32 @test_icmp_trunc1(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -365,7 +362,7 @@ else:
ret i32 0
}
-define i32 @test_icmp_trunc_assume(i32 %x){
+define i32 @test_icmp_trunc_assume(i32 %x) {
; CHECK-LABEL: @test_icmp_trunc_assume(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i16
@@ -532,7 +529,106 @@ if.else:
ret i1 %other
}
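+; An equality assume on (X & Y) or (X | Y) pins known bits of both operands:
+; for and, every bit set in the constant must be set in X and in Y; for or,
+; every bit clear in the constant must be clear in both. Masks that stay
+; inside the known region fold to constants; the *_partial_fail tests touch
+; an unknown bit, so the and survives.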
+define i8 @and_eq_bits_must_be_set(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 1
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 1
+ ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 11
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 11
+ ret i8 %r
+}
+
+define i8 @and_eq_bits_must_be_set2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @and_eq_bits_must_be_set2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = and i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 123
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 111
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = and i8 %x, %y
+ %cmp = icmp eq i8 %xy, 123
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 111
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset2(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: ret i8 0
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 1
+ ret i8 %r
+}
+
+define i8 @or_eq_bits_must_be_unset2_partial_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_eq_bits_must_be_unset2_partial_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[Y]], 4
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp eq i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %y, 7
+ ret i8 %r
+}
+
+define i8 @or_ne_bits_must_be_unset2_fail(i8 %x, i8 %y) {
+; CHECK-LABEL: @or_ne_bits_must_be_unset2_fail(
+; CHECK-NEXT: [[XY:%.*]] = or i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[XY]], 124
+; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 3
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %xy = or i8 %x, %y
+ %cmp = icmp ne i8 %xy, 124
+ call void @llvm.assume(i1 %cmp)
+ %r = and i8 %x, 3
+ ret i8 %r
+}
declare void @use(i1)
declare void @sink(i8)
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 278cabd..05fcf66 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -3693,6 +3693,800 @@ exit:
ret i32 %rem
}
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/QXQDwF
+; X&Y==C?X|Y:X^Y, X&Y==C?X^Y:X|Y
+; TODO: X&Y==0 could imply no_common_bit for the TrueValue
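+; The folds below rest on the identity X|Y == (X^Y) | (X&Y) == (X^Y) + (X&Y):
+; once X&Y is known to equal C, the true arm X|Y equals (X^Y) | C, so the
+; select collapses to its false arm.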
+define i32 @src_and_eq_0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_or_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, 0
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X&Y==0 could imply no_common_bit for the TrueValue
+define i32 @src_and_eq_0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_0_xor_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, 0
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit for the TrueValue
+define i32 @src_and_eq_neg1_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_or_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, -1
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X&Y==-1 could imply all_common_bit for the TrueValue
+define i32 @src_and_eq_neg1_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_and_eq_neg1_xor_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], -1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, -1
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xororC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %or1 = or i32 %xor, %c
+ %cond = select i1 %cmp, i32 %or, i32 %or1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_or_xorxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_or_xorxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %or = or i32 %y, %x
+ %xor = xor i32 %y, %x
+ %xor1 = xor i32 %xor, %c
+ %cond = select i1 %cmp, i32 %or, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_OrAndNotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_OrAndNotC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %c, -1
+ %and1 = and i32 %or, %not
+ %cond = select i1 %cmp, i32 %xor, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_and_eq_C_xor_orxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_and_eq_C_xor_orxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %and = and i32 %y, %x
+ %cmp = icmp eq i32 %and, %c
+ %xor = xor i32 %y, %x
+ %or = or i32 %y, %x
+ %xor1 = xor i32 %or, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %xor1
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/9RPwfN
+; X|Y==C?X&Y:X^Y, X|Y==C?X^Y:X&Y
+; TODO: X|Y==0 could imply no_common_bit for the TrueValue
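+; Inverse identity: X&Y == (X|Y) ^ (X^Y), so with X|Y known to equal C the
+; true arm X&Y equals (X^Y) ^ C and the select collapses.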
+define i32 @src_or_eq_0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_and_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, 0
+ %and = and i32 %y, %x
+ %xor = xor i32 %y, %x
+ %cond = select i1 %cmp, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+; TODO: X|Y==0 could imply no_common_bit for the TrueValue
+define i32 @src_or_eq_0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_0_xor_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y]], [[X]]
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, 0
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %cond = select i1 %cmp, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_and_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, -1
+ %and = and i32 %y, %x
+ %0 = xor i32 %x, %y
+ %not = xor i32 %0, -1
+ %cond = select i1 %cmp, i32 %and, i32 %not
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_neg1_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_or_eq_neg1_xor_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, -1
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %not = xor i32 %and, -1
+ %cond = select i1 %cmp, i32 %xor, i32 %not
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_xorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_xorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[XOR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %and = and i32 %y, %x
+ %xor = xor i32 %y, %x
+ %xor1 = xor i32 %xor, %c
+ %cond = select i1 %cmp, i32 %and, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_and_andnotxorC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_and_andnotxorC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[TMP0]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %and = and i32 %y, %x
+ %0 = xor i32 %x, %y
+ %not = xor i32 %0, -1
+ %and1 = and i32 %not, %c
+ %cond = select i1 %cmp, i32 %and, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_xorandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %xor1 = xor i32 %and, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_or_eq_C_xor_andnotandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_or_eq_C_xor_andnotandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[AND]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[NOT]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %or = or i32 %y, %x
+ %cmp = icmp eq i32 %or, %c
+ %xor = xor i32 %y, %x
+ %and = and i32 %y, %x
+ %not = xor i32 %and, -1
+ %and1 = and i32 %not, %c
+ %cond = select i1 %cmp, i32 %xor, i32 %and1
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/c6oXi4
+; X^Y==C?X&Y:X|Y, X^Y==C?X|Y:X&Y
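+; With X^Y known to equal C, each remaining arm is expressible via the other:
+; X&Y == (X|Y) ^ C and X|Y == (X&Y) | C, so either form of the select
+; collapses.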
+define i32 @src_xor_eq_neg1_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[OR]], -1
+; CHECK-NEXT: ret i32 [[NOT]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, -1
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %or, -1
+ %cond = select i1 %cmp, i32 %and, i32 %not
+ ret i32 %cond
+}
+
+; TODO: X^Y==-1 could imply no_common_bit for the TrueValue
+define i32 @src_xor_eq_neg1_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_xor_eq_neg1_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 [[OR]], i32 -1
+; CHECK-NEXT: ret i32 [[COND]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, -1
+ %or = or i32 %y, %x
+ %cond = select i1 %cmp, i32 %or, i32 -1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_xororC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_xororC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[OR]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %xor1 = xor i32 %or, %c
+ %cond = select i1 %cmp, i32 %and, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_and_andornotC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_and_andornotC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[C:%.*]], -1
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[OR]], [[NOT]]
+; CHECK-NEXT: ret i32 [[AND1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %and = and i32 %y, %x
+ %or = or i32 %y, %x
+ %not = xor i32 %c, -1
+ %and1 = and i32 %or, %not
+ %cond = select i1 %cmp, i32 %and, i32 %and1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_xorandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_xorandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[XOR1:%.*]] = xor i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[XOR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %or = or i32 %y, %x
+ %and = and i32 %y, %x
+ %xor1 = xor i32 %and, %c
+ %cond = select i1 %cmp, i32 %or, i32 %xor1
+ ret i32 %cond
+}
+
+define i32 @src_xor_eq_C_or_orandC(i32 %x, i32 %y, i32 %c) {
+; CHECK-LABEL: @src_xor_eq_C_or_orandC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT: [[OR1:%.*]] = or i32 [[AND]], [[C:%.*]]
+; CHECK-NEXT: ret i32 [[OR1]]
+;
+entry:
+ %xor = xor i32 %y, %x
+ %cmp = icmp eq i32 %xor, %c
+ %or = or i32 %y, %x
+ %and = and i32 %y, %x
+ %or1 = or i32 %and, %c
+ %cond = select i1 %cmp, i32 %or, i32 %or1
+ ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; NOT TRANSFORMED - the select condition is a compare against a constant other than 0
+define i32 @src_select_and_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_positive_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 1
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 1
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_and_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_max_positive_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 2147483647
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 2147483647
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_and_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_and_min_negative_int(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], -2147483648
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, -2147483648
+ %xor = xor i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_positive_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 1
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_positive_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 2147483647
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 2147483647
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_min_negative_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], -2147483648
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, -2147483648
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_or_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_or_max_negative_int(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, -1
+ %and = and i32 %x, %y
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_min_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_positive_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 1
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_max_positive_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_positive_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 2147483647
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 2147483647
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_min_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_min_negative_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], -2147483648
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, -2147483648
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_select_xor_max_negative_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_select_xor_max_negative_int(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], -1
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, -1
+ %and = and i32 %x, %y
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+; Select icmp and/or/xor
+; https://alive2.llvm.org/ce/z/BVgrJ-
+; NOT TRANSFORMED - these patterns are not supported; the note below sketches why.
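+;
+; Illustrative note: for select((x & y) == 0, x & y, x | y) no single and/or/xor
+; of x and y reproduces both arms (on the true path, and == 0 makes xor equal
+; or, which need not be 0), so InstCombine can only substitute the known
+; constant 0 into the true arm, as the CHECK lines below show.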
+define i32 @src_no_trans_select_and_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_or(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_and_xor(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %and0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_or_and(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[OR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %and0, i32 %or, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_eq0_xor_and(
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND0:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[AND0]], i32 [[XOR]], i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %and = and i32 %x, %y
+ %and0 = icmp eq i32 %and, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %and0, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_and(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %or0, i32 %or, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_or_xor(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_and_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_and_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[AND]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %or0, i32 %and, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_or_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_or_eq0_xor_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0]], i32 [[XOR]], i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp eq i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_and_ne0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_and_ne0_xor_or(
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR0_NOT:%.*]] = icmp eq i32 [[OR]], 0
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR0_NOT]], i32 0, i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %or = or i32 %x, %y
+ %or0 = icmp ne i32 %or, 0
+ %xor = xor i32 %x, %y
+ %cond = select i1 %or0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_and(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_and(
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[AND]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %xor0, i32 %xor, i32 %and
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_xor_or(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_xor_or(
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 0, i32 [[OR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %xor, i32 %or
+ ret i32 %cond
+}
+
+define i32 @src_no_trans_select_xor_eq0_and_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_and_xor(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[AND]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %and = and i32 %x, %y
+ %cond = select i1 %xor0, i32 %and, i32 %xor
+ ret i32 %cond
+}
+
+; https://alive2.llvm.org/ce/z/SBe8ei
+define i32 @src_no_trans_select_xor_eq0_or_xor(i32 %x, i32 %y) {
+; CHECK-LABEL: @src_no_trans_select_xor_eq0_or_xor(
+; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[XOR0:%.*]] = icmp eq i32 [[XOR]], 0
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[XOR0]], i32 [[OR]], i32 [[XOR]]
+; CHECK-NEXT: ret i32 [[COND]]
+;
+ %xor = xor i32 %x, %y
+ %xor0 = icmp eq i32 %xor, 0
+ %or = or i32 %x, %y
+ %cond = select i1 %xor0, i32 %or, i32 %xor
+ ret i32 %cond
+}
+
; (X == C) ? X : Y -> (X == C) ? C : Y
; Fixed #77553
define i32 @src_select_xxory_eq0_xorxy_y(i32 %x, i32 %y) {
diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
index 661c360..a4b74aa 100644
--- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes='instcombine<no-verify-fixpoint>' -S | FileCheck %s
define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) {
; CHECK-LABEL: @zext_or_icmp_icmp(
@@ -180,11 +180,11 @@ define i8 @PR49475_infloop(i32 %t0, i16 %insert, i64 %e, i8 %i162) {
; CHECK-NEXT: [[SEXT:%.*]] = shl i64 [[SUB17]], 32
; CHECK-NEXT: [[CONV18:%.*]] = ashr exact i64 [[SEXT]], 32
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i64 [[XOR]], [[CONV18]]
-; CHECK-NEXT: [[CONV19:%.*]] = zext i1 [[CMP]] to i16
-; CHECK-NEXT: [[OR21:%.*]] = or i16 [[CONV19]], [[INSERT]]
-; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = icmp eq i16 [[OR21]], 0
+; CHECK-NEXT: [[TRUNC44:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[INC:%.*]] = add i8 [[TRUNC44]], [[I162]]
+; CHECK-NEXT: [[TOBOOL23_NOT:%.*]] = xor i1 [[CMP]], true
; CHECK-NEXT: call void @llvm.assume(i1 [[TOBOOL23_NOT]])
-; CHECK-NEXT: ret i8 [[I162]]
+; CHECK-NEXT: ret i8 [[INC]]
;
%b = icmp eq i32 %t0, 0
%b2 = icmp eq i16 %insert, 0
diff --git a/llvm/test/Transforms/InstSimplify/implies.ll b/llvm/test/Transforms/InstSimplify/implies.ll
index b70dc90..7e3cb65 100644
--- a/llvm/test/Transforms/InstSimplify/implies.ll
+++ b/llvm/test/Transforms/InstSimplify/implies.ll
@@ -155,7 +155,13 @@ define i1 @test9(i32 %length.i, i32 %i) {
define i1 @test10(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test10(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -65536
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 100
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 90
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
%large = or i32 %x, 100
@@ -166,6 +172,19 @@ define i1 @test10(i32 %length.i, i32 %x.full) {
ret i1 %res
}
+define i1 @test10_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test10_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294901760 ;; 4294901760 == 0xffff0000
+ %large = or disjoint i32 %x, 100
+ %small = or disjoint i32 %x, 90
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
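+; Note (illustrative): 'or disjoint' asserts the operands have no common set
+; bits, so the or behaves like an add; with %x masked to 0xffff0000 the low
+; bits are free, %large stays 10 above %small, and (%large < %length.i)
+; implies (%small < %length.i), which is why this variant still folds to true.
+;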
define i1 @test11(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test11(
; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X:%.*]], 100
@@ -216,7 +235,13 @@ define i1 @test13(i32 %length.i, i32 %x) {
define i1 @test14(i32 %length.i, i32 %x.full) {
; CHECK-LABEL: @test14(
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: [[X:%.*]] = and i32 [[X_FULL:%.*]], -61681
+; CHECK-NEXT: [[LARGE:%.*]] = or i32 [[X]], 8224
+; CHECK-NEXT: [[SMALL:%.*]] = or i32 [[X]], 4112
+; CHECK-NEXT: [[KNOWN:%.*]] = icmp ult i32 [[LARGE]], [[LENGTH_I:%.*]]
+; CHECK-NEXT: [[TO_PROVE:%.*]] = icmp ult i32 [[SMALL]], [[LENGTH_I]]
+; CHECK-NEXT: [[RES:%.*]] = icmp ule i1 [[KNOWN]], [[TO_PROVE]]
+; CHECK-NEXT: ret i1 [[RES]]
;
%x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
%large = or i32 %x, 8224 ;; == 0x2020
@@ -227,6 +252,19 @@ define i1 @test14(i32 %length.i, i32 %x.full) {
ret i1 %res
}
+define i1 @test14_with_disjoint(i32 %length.i, i32 %x.full) {
+; CHECK-LABEL: @test14_with_disjoint(
+; CHECK-NEXT: ret i1 true
+;
+ %x = and i32 %x.full, 4294905615 ;; 4294905615 == 0xffff0f0f
+ %large = or disjoint i32 %x, 8224 ;; == 0x2020
+ %small = or disjoint i32 %x, 4112 ;; == 0x1010
+ %known = icmp ult i32 %large, %length.i
+ %to.prove = icmp ult i32 %small, %length.i
+ %res = icmp ule i1 %known, %to.prove
+ ret i1 %res
+}
+
define i1 @test15(i32 %length.i, i32 %x) {
; CHECK-LABEL: @test15(
; CHECK-NEXT: [[LARGE:%.*]] = add nuw i32 [[X:%.*]], 100
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 3e895edc..afd49aa 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -43,9 +43,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -135,9 +134,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..2ce2a45
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -S < %s | FileCheck %s
+
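+; Both RUN lines share a single check prefix on purpose: this target is not
+; expected to support EVL-style predication, so data-with-evl should fall back
+; and produce the same output as no tail folding (the loop stays scalar below).
+;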
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..5d1a471
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vplan-force-tail-with-evl.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr10 -disable-output < %s 2>&1 | FileCheck %s
+
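+; The initial VPlan below uses a predicated, replicated store region
+; (pred.store) rather than EVL-based recipes, which is the expected fallback
+; when the target does not support EVL predication.
+;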
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in vp<%2> = backedge-taken count
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%16>
+; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
+; CHECK-NEXT: EMIT vp<%5> = icmp ule ir<%iv>, vp<%2>
+; CHECK-NEXT: Successor(s): pred.store
+; CHECK-EMPTY:
+; CHECK-NEXT: <xVFxUF> pred.store: {
+; CHECK-NEXT: pred.store.entry:
+; CHECK-NEXT: BRANCH-ON-MASK vp<%5>
+; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.if:
+; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT: REPLICATE ir<%arrayidx> = getelementptr inbounds ir<%b>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT: REPLICATE ir<%arrayidx2> = getelementptr inbounds ir<%c>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%1> = load ir<%arrayidx2>
+; CHECK-NEXT: REPLICATE ir<%arrayidx4> = getelementptr inbounds ir<%a>, vp<%6>
+; CHECK-NEXT: REPLICATE ir<%add> = add nsw ir<%1>, ir<%0>
+; CHECK-NEXT: REPLICATE store ir<%add>, ir<%arrayidx4>
+; CHECK-NEXT: Successor(s): pred.store.continue
+; CHECK-EMPTY:
+; CHECK-NEXT: pred.store.continue:
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%14> = ir<%0>
+; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%15> = ir<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): for.body.2
+; CHECK-EMPTY:
+; CHECK-NEXT: for.body.2:
+; CHECK-NEXT: EMIT vp<%16> = add vp<%3>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%16>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK-LABEL: VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF * UF
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%10>
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT: CLONE ir<%a1> = getelementptr ir<%p>, vp<%3>
+; CHECK-NEXT: vp<%5> = vector-pointer ir<%a1>
+; CHECK-NEXT: WIDEN ir<%v> = load vp<%5>
+; CHECK-NEXT: CLONE ir<%offset> = add vp<%3>, ir<100>
+; CHECK-NEXT: CLONE ir<%a2> = getelementptr ir<%p>, ir<%offset>
+; CHECK-NEXT: vp<%9> = vector-pointer ir<%a2>
+; CHECK-NEXT: WIDEN store vp<%9>, ir<%v>
+; CHECK-NEXT: EMIT vp<%10> = add nuw vp<%2>, vp<%0>
+; CHECK-NEXT: EMIT branch-on-count vp<%10>, vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 57e1dc9..b876e9d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -1,11 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize < %s -S -o - | FileCheck %s -check-prefix=OUTLOOP
; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+d -passes=loop-vectorize -prefer-inloop-reductions < %s -S -o - | FileCheck %s -check-prefix=INLOOP
-
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple=riscv64 -mattr=+v -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
+; FIXME: inloop reductions are not supported yet with predicated vectorization.
+
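+; Consequently the IF-EVL run below falls back to mask-based tail folding:
+; note the llvm.masked.load fed by an icmp ule lane mask, with no
+; get.vector.length / vp intrinsics in the vector body.
+;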
define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; OUTLOOP-LABEL: @add_i16_i32(
; OUTLOOP-NEXT: entry:
@@ -115,6 +117,70 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
; INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
;
+; IF-EVL-LABEL: @add_i16_i32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; IF-EVL-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL: for.body.preheader:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP12]], i32 2, <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i16> poison)
+; IF-EVL-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
+; IF-EVL-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; IF-EVL-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
+; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup.loopexit:
+; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-NEXT: ret i32 [[R_0_LCSSA]]
+;
entry:
%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
new file mode 100644
index 0000000..835ff37
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
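+; Gathers and scatters are supported on the EVL path: the IF-EVL run below
+; uses llvm.vp.gather/llvm.vp.scatter driven by get.vector.length, while the
+; NO-VP run leaves the loop scalar.
+;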
+define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
+; IF-EVL-LABEL: @gather_scatter(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
+; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> [[TMP19]], i32 [[TMP18]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
+; IF-EVL-NEXT: store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @gather_scatter(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; NO-VP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; NO-VP-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
+; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; NO-VP: for.end:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
+ %0 = load i64, ptr %arrayidx3, align 8
+ %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %0
+ %1 = load float, ptr %arrayidx5, align 4
+ %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %0
+ store float %1, ptr %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
new file mode 100644
index 0000000..0b495bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; FIXME: interleaved accesses are not supported yet with predicated vectorization.
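+; As the checks show, the interleave group is instead lowered to masked
+; gathers guarded by an icmp ule lane mask, and the vector body carries no
+; get.vector.length / vp intrinsics.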
+define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
+; IF-EVL-LABEL: @interleave(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP31]], 8
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP17]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; IF-EVL-NEXT: [[TMP32:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP32]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i64> [[TMP11]], zeroinitializer
+; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; IF-EVL-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP15]]
+; IF-EVL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP37]], i64 0
+; IF-EVL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; IF-EVL-NEXT: [[TMP38:%.*]] = add i64 [[TMP19]], 0
+; IF-EVL-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 1
+; IF-EVL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], [[TMP39]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP26]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[STEP_ADD]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP27]], i32 4, <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP28]], i32 4, <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP29:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER2]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; IF-EVL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; IF-EVL-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
+; IF-EVL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP35]]
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP29]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP23]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP30]], ptr [[TMP36]], i32 4, <vscale x 4 x i1> [[TMP24]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP21]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @interleave(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[TMP10]], i32 0
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[TMP1]], i32 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
+; NO-VP-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; NO-VP-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; NO-VP-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; NO-VP-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 8
+; NO-VP-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP12]], align 4
+; NO-VP-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; NO-VP-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP30]], [[TMP29]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:
+ ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
new file mode 100644
index 0000000..d5ad99f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
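+; The EVL path also handles an i32 induction variable: the IF-EVL run below
+; uses llvm.experimental.get.vector.length.i32 to feed vp.load/vp.store.
+;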
+define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) {
+; IF-EVL-LABEL: @iv32(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP19:%.*]] = sub i32 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP19]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i32 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP_LOAD]], ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP12]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add i32 [[IV]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV1]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV1]]
+; IF-EVL-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT1]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @iv32(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], [[TMP10]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP11]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; NO-VP-NEXT: [[TMP12:%.*]] = mul i32 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
+; NO-VP-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
+; NO-VP-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; NO-VP-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i32 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i32 %iv
+ store i32 %0, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %exitcond.not = icmp eq i32 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
new file mode 100644
index 0000000..203d0c9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
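+; The store below is guarded by a loop-variant condition. With EVL tail
+; folding, the user condition (icmp ne 0) is combined with the lane-active
+; mask via a select, and the conditional load and store are emitted as
+; vp.load/vp.store under that combined mask.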
+define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
+; IF-EVL-LABEL: @masked_loadstore(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; IF-EVL: if.then:
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
+; IF-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: br label [[FOR_INC]]
+; IF-EVL: for.inc:
+; IF-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: exit:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @masked_loadstore(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP0]], 0
+; NO-VP-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; NO-VP: if.then:
+; NO-VP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[I_011]]
+; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
+; NO-VP-NEXT: br label [[FOR_INC]]
+; NO-VP: for.inc:
+; NO-VP-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; NO-VP: exit:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %i.011 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.011
+ %0 = load i32, ptr %arrayidx, align 4
+ %cmp1 = icmp ne i32 %0, 0
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %arrayidx3 = getelementptr inbounds i32, ptr %a, i64 %i.011
+ %1 = load i32, ptr %arrayidx3, align 4
+ %add = add i32 %0, %1
+ store i32 %add, ptr %arrayidx3, align 4
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i64 %i.011, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
new file mode 100644
index 0000000..1c49fba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-no-masking.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; No need to emit predicated vector code if masked vector instructions are not required.
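+; Both RUN configurations share one CHECK prefix: the loop has no memory
+; accesses, and the output below is the unvectorized scalar loop in both cases.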
+define i32 @no_masking() {
+; CHECK-LABEL: @no_masking(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BODY:%.*]]
+; CHECK: body:
+; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BODY]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[P]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[BODY]]
+; CHECK: end:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %body
+
+body:
+ %p = phi i32 [ 1, %entry ], [ %inc, %body ]
+ %inc = add i32 %p, 1
+ %cmp = icmp eq i32 %inc, 0
+ br i1 %cmp, label %end, label %body
+
+end:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
new file mode 100644
index 0000000..f2222e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+; FIXME: Reversed loads/stores are not yet supported with predicated vectorization.
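+; For now the IF-EVL run still vectorizes this loop, but with reversed
+; llvm.masked.load/llvm.masked.store and an icmp-based mask instead of VP
+; intrinsics; the NO-VP run leaves the loop scalar.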
+define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %ptr2) {
+; IF-EVL-LABEL: @reverse_load_store(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
+; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 0, [[TMP14]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 1, [[TMP14]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
+; IF-EVL-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
+; IF-EVL-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
+; IF-EVL-NEXT: [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
+; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
+; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
+; IF-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1
+; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: loopend:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @reverse_load_store(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
+; NO-VP-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
+; NO-VP-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[ADD]]
+; NO-VP-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
+; NO-VP-NEXT: [[INC]] = add i32 [[I]], 1
+; NO-VP-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
+; NO-VP-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND:%.*]]
+; NO-VP: loopend:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %add.phi = phi i64 [ %startval, %entry ], [ %add, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %add = add i64 %add.phi, -1
+ %gepl = getelementptr inbounds i32, ptr %ptr, i64 %add
+ %tmp = load i32, ptr %gepl, align 4
+ %geps = getelementptr inbounds i32, ptr %ptr2, i64 %add
+ store i32 %tmp, ptr %geps, align 4
+ %inc = add i32 %i, 1
+ %exitcond = icmp ne i32 %inc, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
new file mode 100644
index 0000000..c69bb17
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck --check-prefix=NO-VP %s
+
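+; A simple a[i] = b[i] + c[i] loop: the IF-EVL run folds the tail with
+; @llvm.experimental.get.vector.length and VP load/store intrinsics, while the
+; NO-VP run emits a conventional vector body with a scalar epilogue.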
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..72b881b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -0,0 +1,134 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=IF-EVL,CHECK %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP,CHECK %s
+
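+; The IF-EVL plan below models tail folding with an EXPLICIT-VECTOR-LENGTH
+; recipe: the EVL is computed from the EVL-based IV phi and the trip count
+; ir<%N>, then zero-extended and added back into that phi each iteration.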
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[EVL_PHI]]>, ir<%N>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, ir<true>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, ir<true>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, ir<true>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @safe_dep(ptr %p) {
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT: Live-in ir<512> = original trip-count
+; CHECK-EMPTY:
+; CHECK: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; CHECK-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr ir<%p>, vp<[[ST]]>
+; CHECK-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; CHECK-NEXT: WIDEN ir<[[V:%.+]]> = load vp<[[PTR1]]>
+; CHECK-NEXT: CLONE ir<[[OFFSET:.+]]> = add vp<[[ST]]>, ir<100>
+; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]>
+; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]>
+; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [0, %entry], [%iv.next, %loop]
+ %a1 = getelementptr i64, ptr %p, i64 %iv
+ %v = load i64, ptr %a1, align 32
+ %offset = add i64 %iv, 100
+ %a2 = getelementptr i64, ptr %p, i64 %offset
+ store i64 %v, ptr %a2, align 32
+ %iv.next = add i64 %iv, 1
+ %cmp = icmp ne i64 %iv, 511
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..1cf71360
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -S < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
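+; Requesting data-with-evl on this x86 target produces no VP intrinsics: the
+; IF-EVL run folds the tail with fixed-width llvm.masked.load/store and an
+; icmp-based mask, while the NO-VP run uses main and epilogue vector loops.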
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 15
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison)
+; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]])
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; NO-VP: vector.main.loop.iter.check:
+; NO-VP-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
+; NO-VP-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 32
+; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i32>, ptr [[TMP9]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i32>, ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i32>, ptr [[TMP11]], align 4
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
+; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 16
+; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 32
+; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 48
+; NO-VP-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP16]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP18]], align 4
+; NO-VP-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP19]], align 4
+; NO-VP-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
+; NO-VP-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
+; NO-VP-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD4]]
+; NO-VP-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
+; NO-VP-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]]
+; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
+; NO-VP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 16
+; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 32
+; NO-VP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 48
+; NO-VP-NEXT: store <16 x i32> [[TMP20]], ptr [[TMP28]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP21]], ptr [[TMP29]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP22]], ptr [[TMP30]], align 4
+; NO-VP-NEXT: store <16 x i32> [[TMP23]], ptr [[TMP31]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
+; NO-VP-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; NO-VP: vec.epilog.iter.check:
+; NO-VP-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; NO-VP: vec.epilog.ph:
+; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; NO-VP-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8
+; NO-VP-NEXT: [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]]
+; NO-VP-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; NO-VP: vec.epilog.vector.body:
+; NO-VP-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP33:%.*]] = add i64 [[INDEX12]], 0
+; NO-VP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP35]], align 4
+; NO-VP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4
+; NO-VP-NEXT: [[TMP38:%.*]] = add nsw <8 x i32> [[WIDE_LOAD14]], [[WIDE_LOAD13]]
+; NO-VP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP33]]
+; NO-VP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 0
+; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP40]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8
+; NO-VP-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: vec.epilog.middle.block:
+; NO-VP-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; NO-VP-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]]
+; NO-VP: vec.epilog.scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP43]], [[TMP42]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
new file mode 100644
index 0000000..9b49d44
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll
@@ -0,0 +1,89 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output < %s 2>&1 | FileCheck --check-prefix=NO-VP %s
+
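+; Unlike the RISC-V plan, the IF-EVL plan here contains no
+; EXPLICIT-VECTOR-LENGTH recipe: predication is done with a widened canonical
+; induction compared (icmp ule) against the backedge-taken count.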
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+; IF-EVL-EMPTY:
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+; IF-EVL-EMPTY:
+; IF-EVL-NEXT: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]>
+; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]>, vp<[[BETC]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; NO-VP-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; NO-VP-NEXT: Live-in ir<%N> = original trip-count
+; NO-VP-EMPTY:
+; NO-VP: vector.ph:
+; NO-VP-NEXT: Successor(s): vector loop
+; NO-VP-EMPTY:
+; NO-VP-NEXT: <x1> vector loop: {
+; NO-VP-NEXT: vector.body:
+; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1>
+; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load vp<[[PTR2]]>
+; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>
+; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]>
+; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]>
+; NO-VP-NEXT: No successors
+; NO-VP-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
new file mode 100644
index 0000000..0b87270
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
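+; An any-of reduction (a select of 0/1 flags) with forced epilogue
+; vectorization: both the main and epilogue vector loops keep the select-based
+; recurrence and reduce it with vector.reduce.or in their middle blocks.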
+define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
+; CHECK-LABEL: define i32 @any_of_reduction_epilog(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI6]]
+; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0
+; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
+; CHECK-NEXT: [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]]
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[SELECT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %select, %loop ]
+ %gep = getelementptr inbounds i8, ptr %src, i64 %iv
+ %load = load i8, ptr %gep, align 1
+ %icmp = icmp eq i8 %load, 0
+ %select = select i1 %icmp, i32 1, i32 %red
+ %iv.next = add i64 %iv, 1
+ %icmp3 = icmp eq i64 %iv, %N
+ br i1 %icmp3, label %exit, label %loop
+
+exit:
+ ret i32 %select
+}
+
+
+define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
+; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog(
+; CHECK-SAME: i64 [[N:%.*]], i32 [[A:%.*]]) {
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[IND_END6:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
+; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]])
+; CHECK-NEXT: [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false
+; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RED_I1:%.*]] = phi i1 [ [[BC_MERGE_RDX17]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i32 [[IV_2]], [[A]]
+; CHECK-NEXT: [[SEL]] = select i1 [[CMP_1]], i1 [[RED_I1]], i1 false
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
+; CHECK-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i1 [[SEL_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red.i1 = phi i1 [ false, %entry ], [ %sel, %loop ]
+ %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ]
+ %cmp.1 = icmp eq i32 %iv.2, %a
+ %sel = select i1 %cmp.1, i1 %red.i1, i1 false
+ %iv.next = add i64 %iv, 1
+ %iv.2.next = add i32 %iv.2, 1
+ %cmp.2 = icmp eq i64 %iv, %N
+ br i1 %cmp.2, label %exit, label %loop
+
+exit:
+ ret i1 %sel
+
+; uselistorder directives
+ uselistorder i1 %sel, { 1, 0 }
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
new file mode 100644
index 0000000..a90b38c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=IF-EVL %s
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL-LABEL: @foo(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @foo(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP0]], 4
+; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP8]]
+; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP1]], 4
+; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP14]]
+; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP15:%.*]] = mul i64 [[TMP2]], 4
+; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
new file mode 100644
index 0000000..f510d47
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-force-tail-with-evl.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl -force-vector-width=4 \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on \
+; RUN: -disable-output < %s 2>&1 | FileCheck --check-prefixes=NO-VP %s
+
+; The target does not support predicated vectorization.
+define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; NO-VP-NOT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/PGOProfile/vtable_profile.ll b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
index a844003..aae1e2d 100644
--- a/llvm/test/Transforms/PGOProfile/vtable_profile.ll
+++ b/llvm/test/Transforms/PGOProfile/vtable_profile.ll
@@ -1,9 +1,6 @@
; RUN: opt < %s -passes=pgo-instr-gen -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=GEN --implicit-check-not="VTable value profiling is presently not supported"
; RUN: opt < %s -passes=pgo-instr-gen,instrprof -enable-vtable-value-profiling -S 2>&1 | FileCheck %s --check-prefix=LOWER --implicit-check-not="VTable value profiling is presently not supported"
-; __llvm_prf_vnm stores zlib-compressed vtable names.
-; REQUIRES: zlib
-
source_filename = "vtable_local.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -59,7 +56,7 @@ target triple = "x86_64-unknown-linux-gnu"
; LOWER: $"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = comdat nodeduplicate
; LOWER: @__profvt__ZTV7Derived = global { i64, ptr, i32 } { i64 -4576307468236080025, ptr @_ZTV7Derived, i32 48 }, section "__llvm_prf_vtab", comdat, align 8
; LOWER: @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E" = internal global { i64, ptr, i32 } { i64 1419990121885302679, ptr @_ZTVN12_GLOBAL__N_15Base2E, i32 24 }, section "__llvm_prf_vtab", comdat, align 8
-; LOWER: @__llvm_prf_vnm = private constant [64 x i8] c"7>x\DA\8B\8F\0A\093wI-\CA,KMa,+IL\CAI\8D\CF\C9ON\CC\D1\CB\C9\B1\8E\07J\FA\19\1A\C5\BB\FB\F8;9\FA\C4\C7\FB\C5\1B\9A:%\16\A7\1A\B9\02\00\19:\12o", section "__llvm_prf_vns", align 1
+; LOWER: @__llvm_prf_vnm = private constant {{.*}}, section "__llvm_prf_vns", align 1
; LOWER: @llvm.used = appending global [5 x ptr] [ptr @__profvt__ZTV7Derived, ptr @"__profvt_vtable_local.ll;_ZTVN12_GLOBAL__N_15Base2E", ptr @__llvm_prf_vnodes, ptr @__llvm_prf_nm, ptr @__llvm_prf_vnm], section "llvm.metadata"
define i32 @_Z4funci(i32 %a) {
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index 495ec0a..45e411d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -9,11 +9,7 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
diff --git a/llvm/test/Transforms/RemoveTraps/remove-traps.ll b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
index 80b86e0..c8d5fec 100644
--- a/llvm/test/Transforms/RemoveTraps/remove-traps.ll
+++ b/llvm/test/Transforms/RemoveTraps/remove-traps.ll
@@ -1,18 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt < %s -passes='function(remove-traps)' -S | FileCheck %s --check-prefixes=NOPROFILE
; RUN: opt < %s -passes='function(remove-traps)' -remove-traps-random-rate=1 -S | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -S | FileCheck %s --check-prefixes=HOT
+; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=990000 -S | FileCheck %s --check-prefixes=HOT99
; RUN: opt < %s -passes='require<profile-summary>,function(remove-traps)' -remove-traps-percentile-cutoff-hot=700000 -S | FileCheck %s --check-prefixes=HOT70
target triple = "x86_64-pc-linux-gnu"
declare void @llvm.ubsantrap(i8 immarg)
+declare i1 @llvm.allow.ubsan.check(i8 immarg)
define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; NOPROFILE-LABEL: define dso_local noundef i32 @simple(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -24,7 +26,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; ALL-LABEL: define dso_local noundef i32 @simple(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -33,22 +36,24 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @simple(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @simple(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @simple(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -58,7 +63,8 @@ define dso_local noundef i32 @simple(ptr noundef readonly %0) {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -76,7 +82,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; NOPROFILE-LABEL: define dso_local noundef i32 @hot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -88,7 +95,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; ALL-LABEL: define dso_local noundef i32 @hot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -97,22 +105,24 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @hot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @hot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @hot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF16:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -122,7 +132,8 @@ define dso_local noundef i32 @hot(ptr noundef readonly %0) !prof !36 {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -139,7 +150,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; NOPROFILE-LABEL: define dso_local noundef i32 @veryHot(
; NOPROFILE-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; NOPROFILE-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; NOPROFILE: 3:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -151,7 +163,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; ALL-LABEL: define dso_local noundef i32 @veryHot(
; ALL-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; ALL-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; ALL-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; ALL: 3:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -160,22 +173,24 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
; ALL-NEXT: ret i32 [[TMP5]]
;
-; HOT-LABEL: define dso_local noundef i32 @veryHot(
-; HOT-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
-; HOT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
-; HOT-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
-; HOT: 3:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
-; HOT-NEXT: ret i32 [[TMP5]]
+; HOT99-LABEL: define dso_local noundef i32 @veryHot(
+; HOT99-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
+; HOT99-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; HOT99: 3:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+; HOT99-NEXT: ret i32 [[TMP5]]
;
; HOT70-LABEL: define dso_local noundef i32 @veryHot(
; HOT70-SAME: ptr noundef readonly [[TMP0:%.*]]) !prof [[PROF17:![0-9]+]] {
; HOT70-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], true
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT70-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP6]], label [[TMP3:%.*]], label [[TMP4:%.*]]
; HOT70: 3:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -185,7 +200,8 @@ define dso_local noundef i32 @veryHot(ptr noundef readonly %0) !prof !39 {
; HOT70-NEXT: ret i32 [[TMP5]]
;
%chk = icmp eq ptr %0, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%2 = or i1 %chk, %hot
br i1 %2, label %3, label %4
@@ -206,7 +222,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -224,7 +241,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -236,23 +254,24 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchColdFnHot(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
-; HOT-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchColdFnHot(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchColdFnHot(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF17]] {
@@ -260,7 +279,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF18:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -277,7 +297,8 @@ define dso_local noundef i32 @branchColdFnHot(i32 noundef %0, ptr noundef readon
4:
%chk = icmp eq ptr %1, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
@@ -301,7 +322,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; NOPROFILE-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; NOPROFILE: 4:
; NOPROFILE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; NOPROFILE-NEXT: [[HOT:%.*]] = xor i1 true, true
+; NOPROFILE-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; NOPROFILE-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; NOPROFILE: 6:
; NOPROFILE-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -319,7 +341,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; ALL: 4:
; ALL-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
+; ALL-NEXT: [[HOT:%.*]] = xor i1 false, true
+; ALL-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; ALL-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; ALL: 6:
; ALL-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -331,23 +354,24 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
; ALL-NEXT: ret i32 [[TMP10]]
;
-; HOT-LABEL: define dso_local noundef i32 @branchHotFnCold(
-; HOT-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
-; HOT-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
-; HOT-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
-; HOT: 4:
-; HOT-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], true
-; HOT-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; HOT: 6:
-; HOT-NEXT: tail call void @llvm.ubsantrap(i8 22)
-; HOT-NEXT: unreachable
-; HOT: 7:
-; HOT-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
-; HOT-NEXT: br label [[TMP9]]
-; HOT: 9:
-; HOT-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
-; HOT-NEXT: ret i32 [[TMP10]]
+; HOT99-LABEL: define dso_local noundef i32 @branchHotFnCold(
+; HOT99-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
+; HOT99-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0
+; HOT99-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
+; HOT99: 4:
+; HOT99-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
+; HOT99-NEXT: [[HOT:%.*]] = xor i1 false, true
+; HOT99-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
+; HOT99-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; HOT99: 6:
+; HOT99-NEXT: tail call void @llvm.ubsantrap(i8 22)
+; HOT99-NEXT: unreachable
+; HOT99: 7:
+; HOT99-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; HOT99-NEXT: br label [[TMP9]]
+; HOT99: 9:
+; HOT99-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ 0, [[TMP2:%.*]] ]
+; HOT99-NEXT: ret i32 [[TMP10]]
;
; HOT70-LABEL: define dso_local noundef i32 @branchHotFnCold(
; HOT70-SAME: i32 noundef [[TMP0:%.*]], ptr noundef readonly [[TMP1:%.*]]) !prof [[PROF16]] {
@@ -355,7 +379,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; HOT70-NEXT: br i1 [[TMP3]], label [[TMP9:%.*]], label [[TMP4:%.*]], !prof [[PROF19:![0-9]+]]
; HOT70: 4:
; HOT70-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP1]], null
-; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], false
+; HOT70-NEXT: [[HOT:%.*]] = xor i1 true, true
+; HOT70-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[HOT]]
; HOT70-NEXT: br i1 [[TMP11]], label [[TMP6:%.*]], label [[TMP7:%.*]]
; HOT70: 6:
; HOT70-NEXT: tail call void @llvm.ubsantrap(i8 22)
@@ -372,7 +397,8 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
4:
%chk = icmp eq ptr %1, null
- %hot = call i1 @llvm.experimental.hot()
+ %allow = call i1 @llvm.allow.ubsan.check(i8 22)
+ %hot = xor i1 %allow, true
%5 = or i1 %chk, %hot
br i1 %5, label %6, label %7
@@ -424,10 +450,10 @@ define dso_local noundef i32 @branchHotFnCold(i32 noundef %0, ptr noundef readon
; ALL: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
; ALL: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
-; HOT: [[PROF16]] = !{!"function_entry_count", i64 1000}
-; HOT: [[PROF17]] = !{!"function_entry_count", i64 7000}
-; HOT: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
-; HOT: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
+; HOT99: [[PROF16]] = !{!"function_entry_count", i64 1000}
+; HOT99: [[PROF17]] = !{!"function_entry_count", i64 7000}
+; HOT99: [[PROF18]] = !{!"branch_weights", i32 1000, i32 1}
+; HOT99: [[PROF19]] = !{!"branch_weights", i32 1, i32 1000}
;.
; HOT70: [[PROF16]] = !{!"function_entry_count", i64 1000}
; HOT70: [[PROF17]] = !{!"function_entry_count", i64 7000}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
index 89ea15d..e492596 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
@@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-LABEL: define void @gather_2(
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
-; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
-; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
-; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
-; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
-; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
+; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
; CHECK-NEXT: ret void
;
@@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
-; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
-; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
-; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
+; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
new file mode 100644
index 0000000..979d0ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/ext-alt-node-must-ext.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=systemz-unknown -mcpu=z15 < %s -slp-threshold=-10 | FileCheck %s
+
+define i32 @test(ptr %0, ptr %1) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 32 to ptr), align 32
+; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 32
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = sext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP9]] to <2 x i8>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i8> [[TMP16]], <2 x i8> [[TMP11]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i8> [[TMP12]], i32 0
+; CHECK-NEXT: [[DOTNEG:%.*]] = sext i8 [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i8> [[TMP12]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[DOTNEG]], [[TMP8]]
+; CHECK-NEXT: ret i32 [[TMP10]]
+;
+ %3 = load i64, ptr inttoptr (i64 32 to ptr), align 32
+ %4 = load ptr, ptr %1, align 8
+ %5 = getelementptr inbounds i8, ptr %4, i64 32
+ %6 = load i64, ptr %5, align 8
+ %7 = icmp ne i64 %3, 0
+ %8 = zext i1 %7 to i32
+ %9 = icmp ne i64 %6, 0
+ %.neg = sext i1 %9 to i32
+ %10 = add nsw i32 %.neg, %8
+ ret i32 %10
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
new file mode 100644
index 0000000..84f7e21
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external-user-instruction-minbitwidth.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@e = global i8 0
+@c = global i16 0
+@d = global i32 0
+
+define i8 @test() {
+; CHECK-LABEL: define i8 @test() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @e, align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @c, align 2
+; CHECK-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[CONV]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32769>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[CONV1]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP8]])
+; CHECK-NEXT: [[CONV4_30:%.*]] = trunc i32 [[TMP11]] to i8
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; CHECK-NEXT: [[XOR_31:%.*]] = and i32 [[TMP13]], -2
+; CHECK-NEXT: store i32 [[XOR_31]], ptr @d, align 4
+; CHECK-NEXT: ret i8 [[CONV4_30]]
+;
+entry:
+ %0 = load i8, ptr @e, align 1
+ %conv = sext i8 %0 to i32
+ %1 = load i16, ptr @c, align 2
+ %conv1 = zext i16 %1 to i32
+ %or.16 = or i32 %conv, 1
+ %add.16 = add nsw i32 %or.16, %conv1
+ %or.18 = or i32 %conv, 1
+ %add.18 = add nsw i32 %or.18, %conv1
+ %conv4.181 = or i32 %add.16, %add.18
+ %or.20 = or i32 %conv, 1
+ %add.20 = add nsw i32 %or.20, %conv1
+ %conv4.202 = or i32 %conv4.181, %add.20
+ %or.22 = or i32 %conv, 1
+ %add.22 = add nsw i32 %or.22, %conv1
+ %conv4.223 = or i32 %conv4.202, %add.22
+ %or.24 = or i32 %conv, 1
+ %add.24 = add nsw i32 %or.24, %conv1
+ %conv4.244 = or i32 %conv4.223, %add.24
+ %or.26 = or i32 %conv, 1
+ %add.26 = add nsw i32 %or.26, %conv1
+ %conv4.265 = or i32 %conv4.244, %add.26
+ %or.28 = or i32 %conv, 1
+ %add.28 = add nsw i32 %or.28, %conv1
+ %conv4.286 = or i32 %conv4.265, %add.28
+ %or.30 = or i32 %conv, 32769
+ %add.30 = add nsw i32 %or.30, %conv1
+ %conv4.307 = or i32 %conv4.286, %add.30
+ %conv4.30 = trunc i32 %conv4.307 to i8
+ %xor.31 = and i32 %or.30, -2
+ store i32 %xor.31, ptr @d, align 4
+ ret i8 %conv4.30
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index 66e3fbf..4cc3c12 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
define void @PR49730() {
; CHECK-LABEL: @PR49730(
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
new file mode 100644
index 0000000..6b27015
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %sptr, i64 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[SPTR:%.*]], i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT: [[IV2:%.*]] = getelementptr i8, ptr [[SPTR]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IV2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV_I]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT: [[AND33:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT: ret i32 [[AND33]]
+;
+entry:
+ %conv.i = trunc i64 %0 to i32
+ %iv2 = getelementptr i8, ptr %sptr, i64 4
+ %1 = load i32, ptr %iv2, align 4
+ %cmp11 = icmp slt i32 %1, %conv.i
+ %cmp.i57 = icmp eq i32 %1, 0
+ %or.i5977 = or i1 %cmp.i57, %cmp11
+ %iv4 = getelementptr i8, ptr %sptr, i64 12
+ %2 = load i32, ptr %iv4, align 4
+ %cmp16 = icmp sle i32 %2, %conv.i
+ %cmp.i62 = icmp eq i32 %2, 0
+ %or.i6478 = or i1 %cmp.i62, %cmp16
+ %iv3 = getelementptr i8, ptr %sptr, i64 8
+ %3 = load i32, ptr %iv3, align 8
+ %cmp21 = icmp sgt i32 %3, %conv.i
+ %cmp.i67 = icmp eq i32 %3, 0
+ %or.i6979 = or i1 %cmp.i67, %cmp21
+ %iv5 = getelementptr i8, ptr %sptr, i64 16
+ %4 = load i32, ptr %iv5, align 8
+ %cmp26 = icmp slt i32 %conv.i, 0
+ %cmp.i72 = icmp eq i32 %4, 0
+ %or.i7480 = or i1 %cmp.i72, %cmp26
+ %and3183 = and i1 %or.i5977, %or.i6478
+ %and3284 = and i1 %and3183, %or.i6979
+ %and3385 = and i1 %and3284, %or.i7480
+ %and33 = zext i1 %and3385 to i32
+ ret i32 %and33
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
index fc28d7a..e1fd8a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
@@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
+; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
index 136ab64..668d3c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-node-with-multi-users.ll
@@ -10,12 +10,14 @@ define void @test() {
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index b5a3c57..acc04be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -94,17 +94,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
-; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 1, i32 3, i32 6, i32 0>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; SSE-NEXT: [[S3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
; SSE-NEXT: ret i1 [[S3]]
;
; AVX-LABEL: @logical_and_icmp_diff_preds(
@@ -391,17 +387,28 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
+; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
+; AVX-NEXT: ret i1 [[TMP8]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index fb2b653..82085ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -12,10 +12,10 @@ define void @test() {
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 46cca9b..1faeea7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
diff --git a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
index 66229c2..8b131cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
new file mode 100644
index 0000000..8e98851
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/non-probe-stale-profile-matching.prof
@@ -0,0 +1,23 @@
+main:9229397:0
+ 0: 0
+ 1: 0
+ 1.1: 47663
+ 1.2: 51871
+ 2: 48723
+ 3: 48723 bar:49018
+ 4: 49087
+ 5: 51871 bar:49588
+ 7: 0
+ 2: foo:1479916
+ 1: 47663
+ 1.1: 46683 bar:43238
+ 2: 4519 bar:4932
+ 3: 48723
+ 4: foo:1505537
+ 1: 48604
+ 1.1: 46965 bar:44479
+ 2: 4613 bar:4967
+ 3: 49087
+bar:2333388:196222
+ 0: 194449
+ 1: 194449
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
index ba4c611..d384794 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-profile.prof
@@ -1,8 +1,8 @@
foo:3200:13
1: 13
2: 7
- 3: 6
- 4: 13
- 5: 7 _Z3barv:2 _Z3foov:5
- 6: 6 _Z3barv:4 _Z3foov:2
+ 4: 6
+ 6: 13
+ 3: 7 _Z3barv:2 _Z3foov:5
+ 5: 6 _Z3barv:4 _Z3foov:2
!CFGChecksum: 563022570642068
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
index 62f9bd5..213bf0b 100644
--- a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-update.prof
@@ -1,8 +1,8 @@
foo:3200:13
1: 13
2: 7
- 3: 6
- 4: 13
- 5: 7
- 6: 6
+ 4: 6
+ 6: 13
+ 7: 7
+ 9: 6
!CFGChecksum: 844530426352218
diff --git a/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
new file mode 100644
index 0000000..eb69c18a
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/non-probe-stale-profile-matching.ll
@@ -0,0 +1,229 @@
+; REQUIRES: x86_64-linux
+; REQUIRES: asserts
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/non-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s
+
+; The profiled source code:
+
+; volatile int x = 1;
+; __attribute__((noinline)) int bar(int p) {
+; return p;
+; }
+
+; __attribute__((always_inline)) int foo(int i, int p) {
+; if (i % 10) return bar(p);
+; else return bar(p + 1);
+; }
+
+; int main() {
+; for (int i = 0; i < 1000 * 1000; i++) {
+; x += foo(i, x);
+; x += bar(x);
+; x += foo(i, x);
+; x += bar(x);
+; }
+; }
+
+; The source code for the current build:
+
+; volatile int x = 1;
+; __attribute__((noinline)) int bar(int p) {
+; return p;
+; }
+
+; __attribute__((always_inline)) int foo(int i, int p) {
+; if (i % 10) return bar(p);
+; else return bar(p + 1);
+; }
+
+; int main() {
+; if (x == 0) // code change
+; return 0; // code change
+; for (int i = 0; i < 1000 * 1000; i++) {
+; x += foo(i, x);
+; x += bar(x);
+; if (i < 0) // code change
+; return 0; // code change
+; x += foo(i, x);
+; x += bar(x);
+; }
+; }
+
+; CHECK: Run stale profile matching for bar
+
+; CHECK: Run stale profile matching for foo
+; CHECK: Callsite with callee:bar is matched from 1.1 to 1.1
+; CHECK: Callsite with callee:bar is matched from 2 to 2
+
+; CHECK: Run stale profile matching for main
+; CHECK: Callsite with callee:foo is matched from 4 to 2
+; CHECK: Callsite with callee:bar is matched from 5 to 3
+; CHECK: Callsite with callee:foo is matched from 8 to 4
+; CHECK: Callsite with callee:bar is matched from 9 to 5
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = dso_local global i32 1, align 4
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @bar(i32 noundef %p) #0 !dbg !9 {
+entry:
+ ret i32 %p, !dbg !13
+}
+
+; Function Attrs: alwaysinline nounwind uwtable
+define dso_local i32 @foo(i32 noundef %i, i32 noundef %p) #1 !dbg !14 {
+entry:
+ %rem = srem i32 %i, 10, !dbg !15
+ %tobool = icmp ne i32 %rem, 0, !dbg !15
+ br i1 %tobool, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %entry
+ %call = call i32 @bar(i32 noundef %p), !dbg !17
+ br label %return, !dbg !19
+
+if.else: ; preds = %entry
+ %add = add nsw i32 %p, 1, !dbg !20
+ %call1 = call i32 @bar(i32 noundef %add), !dbg !21
+ br label %return, !dbg !22
+
+return: ; preds = %if.else, %if.then
+ %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ], !dbg !23
+ ret i32 %retval.0, !dbg !24
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() #2 !dbg !25 {
+entry:
+ %0 = load volatile i32, ptr @x, align 4, !dbg !26, !tbaa !27
+ %cmp = icmp eq i32 %0, 0, !dbg !31
+ br i1 %cmp, label %if.then, label %if.end, !dbg !26
+
+if.then: ; preds = %entry
+ br label %for.end, !dbg !32
+
+if.end: ; preds = %entry
+ br label %for.cond, !dbg !33
+
+for.cond: ; preds = %if.end6, %if.end
+ %i.0 = phi i32 [ 0, %if.end ], [ %inc, %if.end6 ], !dbg !34
+ %cmp1 = icmp slt i32 %i.0, 1000000, !dbg !35
+ br i1 %cmp1, label %for.body, label %for.cond.cleanup, !dbg !37
+
+for.cond.cleanup: ; preds = %for.cond
+ br label %cleanup, !dbg !38
+
+for.body: ; preds = %for.cond
+ %1 = load volatile i32, ptr @x, align 4, !dbg !40, !tbaa !27
+ %call = call i32 @foo(i32 noundef %i.0, i32 noundef %1), !dbg !41
+ %2 = load volatile i32, ptr @x, align 4, !dbg !42, !tbaa !27
+ %add = add nsw i32 %2, %call, !dbg !42
+ store volatile i32 %add, ptr @x, align 4, !dbg !42, !tbaa !27
+ %3 = load volatile i32, ptr @x, align 4, !dbg !43, !tbaa !27
+ %call2 = call i32 @bar(i32 noundef %3), !dbg !44
+ %4 = load volatile i32, ptr @x, align 4, !dbg !45, !tbaa !27
+ %add3 = add nsw i32 %4, %call2, !dbg !45
+ store volatile i32 %add3, ptr @x, align 4, !dbg !45, !tbaa !27
+ br i1 false, label %if.then5, label %if.end6, !dbg !46
+
+if.then5: ; preds = %for.body
+ br label %cleanup, !dbg !47
+
+if.end6: ; preds = %for.body
+ %5 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !27
+ %call7 = call i32 @foo(i32 noundef %i.0, i32 noundef %5), !dbg !49
+ %6 = load volatile i32, ptr @x, align 4, !dbg !50, !tbaa !27
+ %add8 = add nsw i32 %6, %call7, !dbg !50
+ store volatile i32 %add8, ptr @x, align 4, !dbg !50, !tbaa !27
+ %7 = load volatile i32, ptr @x, align 4, !dbg !51, !tbaa !27
+ %call9 = call i32 @bar(i32 noundef %7), !dbg !52
+ %8 = load volatile i32, ptr @x, align 4, !dbg !53, !tbaa !27
+ %add10 = add nsw i32 %8, %call9, !dbg !53
+ store volatile i32 %add10, ptr @x, align 4, !dbg !53, !tbaa !27
+ %inc = add nsw i32 %i.0, 1, !dbg !54
+ br label %for.cond, !dbg !56, !llvm.loop !57
+
+cleanup: ; preds = %if.then5, %for.cond.cleanup
+ br label %for.end
+
+for.end: ; preds = %cleanup, %if.then
+ ret i32 0, !dbg !61
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3
+
+attributes #0 = { noinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #1 = { alwaysinline nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #2 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "path")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{!"clang version 19.0.0git"}
+!9 = distinct !DISubprogram(name: "bar", scope: !10, file: !10, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DIFile(filename: "test.c", directory: "path")
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 3, column: 3, scope: !9)
+!14 = distinct !DISubprogram(name: "foo", scope: !10, file: !10, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = !DILocation(line: 7, column: 9, scope: !14)
+!16 = !DILocation(line: 7, column: 7, scope: !14)
+!17 = !DILocation(line: 7, column: 23, scope: !18)
+!18 = !DILexicalBlockFile(scope: !14, file: !10, discriminator: 2)
+!19 = !DILocation(line: 7, column: 15, scope: !18)
+!20 = !DILocation(line: 8, column: 21, scope: !14)
+!21 = !DILocation(line: 8, column: 15, scope: !14)
+!22 = !DILocation(line: 8, column: 8, scope: !14)
+!23 = !DILocation(line: 0, scope: !14)
+!24 = !DILocation(line: 9, column: 1, scope: !14)
+!25 = distinct !DISubprogram(name: "main", scope: !10, file: !10, line: 11, type: !11, scopeLine: 11, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!26 = !DILocation(line: 12, column: 7, scope: !25)
+!27 = !{!28, !28, i64 0}
+!28 = !{!"int", !29, i64 0}
+!29 = !{!"omnipotent char", !30, i64 0}
+!30 = !{!"Simple C/C++ TBAA"}
+!31 = !DILocation(line: 12, column: 9, scope: !25)
+!32 = !DILocation(line: 13, column: 5, scope: !25)
+!33 = !DILocation(line: 14, column: 8, scope: !25)
+!34 = !DILocation(line: 14, scope: !25)
+!35 = !DILocation(line: 14, column: 21, scope: !36)
+!36 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 2)
+!37 = !DILocation(line: 14, column: 3, scope: !36)
+!38 = !DILocation(line: 14, column: 3, scope: !39)
+!39 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 4)
+!40 = !DILocation(line: 15, column: 18, scope: !25)
+!41 = !DILocation(line: 15, column: 11, scope: !25)
+!42 = !DILocation(line: 15, column: 8, scope: !25)
+!43 = !DILocation(line: 16, column: 15, scope: !25)
+!44 = !DILocation(line: 16, column: 11, scope: !25)
+!45 = !DILocation(line: 16, column: 8, scope: !25)
+!46 = !DILocation(line: 17, column: 10, scope: !25)
+!47 = !DILocation(line: 18, column: 8, scope: !25)
+!48 = !DILocation(line: 19, column: 18, scope: !25)
+!49 = !DILocation(line: 19, column: 11, scope: !25)
+!50 = !DILocation(line: 19, column: 8, scope: !25)
+!51 = !DILocation(line: 20, column: 15, scope: !25)
+!52 = !DILocation(line: 20, column: 11, scope: !25)
+!53 = !DILocation(line: 20, column: 8, scope: !25)
+!54 = !DILocation(line: 14, column: 37, scope: !55)
+!55 = !DILexicalBlockFile(scope: !25, file: !10, discriminator: 6)
+!56 = !DILocation(line: 14, column: 3, scope: !55)
+!57 = distinct !{!57, !58, !59, !60}
+!58 = !DILocation(line: 14, column: 3, scope: !25)
+!59 = !DILocation(line: 21, column: 3, scope: !25)
+!60 = !{!"llvm.loop.mustprogress"}
+!61 = !DILocation(line: 22, column: 1, scope: !25)
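For context on the CHECK lines in the new test above: stale profile matching collects callsite anchors (location, callee name) in order from both the current IR and the stale profile, pairs anchors whose callee names agree, and remaps locations accordingly, so the call to foo at IR location 4 lands on profile location 2. The sketch below is a deliberately simplified toy (a greedy in-order walk, with matchAnchors as a made-up name), not the in-tree SampleProfileMatcher, which computes a proper sequence diff:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// An anchor is (location, callee name); both lists are in source order.
using Anchors = std::vector<std::pair<std::string, std::string>>;

// Greedily pair anchors whose callee names agree, preserving order.
// The real matcher diffs the two anchor sequences; this only shows the idea.
static void matchAnchors(const Anchors &IR, const Anchors &Prof) {
  size_t J = 0;
  for (const auto &A : IR) {
    while (J < Prof.size() && Prof[J].second != A.second)
      ++J;
    if (J == Prof.size())
      break;
    printf("Callsite with callee:%s is matched from %s to %s\n",
           A.second.c_str(), A.first.c_str(), Prof[J++].first.c_str());
  }
}

int main() {
  // Anchors for main() in the current IR vs. in the stale profile above.
  Anchors IR = {{"4", "foo"}, {"5", "bar"}, {"8", "foo"}, {"9", "bar"}};
  Anchors Prof = {{"2", "foo"}, {"3", "bar"}, {"4", "foo"}, {"5", "bar"}};
  matchAnchors(IR, Prof); // reproduces the four CHECK lines for main
  return 0;
}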
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
index 4881937..43be142 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll
@@ -1,7 +1,9 @@
; REQUIRES: x86_64-linux
; REQUIRES: asserts
-; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s
+; Without the profile-checksum-mismatch attribute, stale profile matching is not run for main, even though the checksum in its pseudo_probe_desc is mismatched.
+; CHECK-NOT: Run stale profile matching for main
; CHECK: Run stale profile matching for bar
; CHECK: Callsite with callee:baz is matched from 4 to 2
@@ -14,7 +16,7 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-define i32 @main() #0 {
+define available_externally i32 @main() #0 {
%1 = call i32 @bar(), !dbg !13
ret i32 0
}
@@ -47,7 +49,8 @@ attributes #1 = { "profile-checksum-mismatch" "use-sample-profile" }
!9 = distinct !DICompileUnit(language: DW_LANG_C11, file: !10, producer: "clang version 19.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
!10 = !DIFile(filename: "test2.c", directory: "/home/test", checksumkind: CSK_MD5, checksum: "553093afc026f9c73562eb3b0c5b7532")
!11 = !{i32 2, !"Debug Info Version", i32 3}
-!12 = !{i64 -2624081020897602054, i64 281582081721716, !"main"}
+; Make a checksum mismatch in the pseudo_probe_desc
+!12 = !{i64 -2624081020897602054, i64 123456, !"main"}
!13 = !DILocation(line: 8, column: 10, scope: !14)
!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 186646591)
!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 7, column: 40)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
index 4647a34f..f0b6fdf 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-dangle.ll
@@ -23,21 +23,21 @@ Merge:
; JT-LABEL-NO: T
; JT-LABEL-NO: F
; JT-LABEL: Merge
+; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4
; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3
-; JT-NOT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2
-; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+; JT: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; ASM-NOT: .pseudoprobe 6699318081062747564 4
; ASM-NOT: .pseudoprobe 6699318081062747564 3
-; ASM-NOT: .pseudoprobe 6699318081062747564 2
-; ASM: .pseudoprobe 6699318081062747564 4 0 0
+; ASM: .pseudoprobe 6699318081062747564 5 0 0
ret i32 %call
}
-;; Check block T and F are gone, and their probes (probe 2 and 3) are gone too.
+;; Check blocks T and F are gone, and their probes (probes 3 and 4) are gone too.
; MIR-tail: bb.0
; MIR-tail: PSEUDO_PROBE [[#GUID:]], 1, 0, 0
-; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 2
; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 3
-; MIR-tail: PSEUDO_PROBE [[#GUID:]], 4, 0, 0
+; MIR-tail-NOT: PSEUDO_PROBE [[#GUID:]], 4
+; MIR-tail: PSEUDO_PROBE [[#GUID:]], 5, 0, 0
define i32 @test(i32 %a, i32 %b, i32 %c) {
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
index 62f0737..97b0ed6 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-discriminator.ll
@@ -62,10 +62,10 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "fra
; DEBUG: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; DEBUG: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
-
+
; PROBE: ![[CALL1]] = !DILocation(line: 4, column: 3, scope: ![[CALL1BLOCK:[0-9]+]])
-; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646575)
+; PROBE: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646559)
; PROBE: ![[CALL2]] = !DILocation(line: 4, column: 9, scope: ![[CALL2BLOCK:[0-9]+]])
-; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646583)
+; PROBE: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 186646567)
; PROBE: ![[INST]] = !DILocation(line: 4, column: 15, scope: ![[INSTBLOCK:[0-9]+]])
; PROBE: ![[INSTBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
index 822ab40..03bb64b 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-invoke.ll
@@ -18,10 +18,12 @@ entry:
if.then: ; preds = %entry
; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 2
+; callsite probe 3
invoke void @_Z3foov()
to label %invoke.cont unwind label %terminate.lpad, !dbg !24
invoke.cont: ; preds = %if.then
+; callsite probe 4
; CHECK-NOT: call void @llvm.pseudoprobe(i64 -1069303473483922844,
invoke void @_Z3bazv()
to label %invoke.cont1 unwind label %terminate.lpad, !dbg !26
@@ -31,7 +33,8 @@ invoke.cont1: ; preds = %invoke.cont
br label %if.end, !dbg !27
if.else: ; preds = %entry
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 3
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; callsite probe 6
invoke void @_Z3foov()
to label %invoke.cont2 unwind label %terminate.lpad, !dbg !28
@@ -40,7 +43,8 @@ invoke.cont2: ; preds = %if.else
br label %if.end
if.end: ; preds = %invoke.cont2, %invoke.cont1
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 4
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 7
+; callsite probe 8
invoke void @_Z3foov()
to label %invoke.cont3 unwind label %terminate.lpad, !dbg !29
@@ -51,14 +55,14 @@ invoke.cont3: ; preds = %if.end
br i1 %tobool4, label %if.then5, label %if.end6, !dbg !32
if.then5: ; preds = %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 5
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 9
%2 = load volatile i32, ptr @x, align 4, !dbg !33, !tbaa !19
%inc = add nsw i32 %2, 1, !dbg !33
store volatile i32 %inc, ptr @x, align 4, !dbg !33, !tbaa !19
br label %if.end6, !dbg !35
if.end6: ; preds = %if.then5, %invoke.cont3
-; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 6
+; CHECK: call void @llvm.pseudoprobe(i64 -1069303473483922844, i64 10
ret void, !dbg !36
terminate.lpad: ; preds = %if.end, %if.else, %invoke.cont, %if.then
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
index 148f3ed..379dcfc 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile-metadata-2.ll
@@ -29,7 +29,7 @@ if.else:
br label %return
return:
- call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1)
+ call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1)
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
@@ -55,13 +55,12 @@ attributes #0 = {"use-sample-profile"}
!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !5, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
!10 = !{!"function_entry_count", i64 14}
!11 = !{!"branch_weights", i32 100, i32 0}
-;; A discriminator of 186646575 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 1.0.
-!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646575)
+;; A discriminator of 186646559, which is 0xB20001F in hexadecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 1.0.
+!12 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646559)
!13 = distinct !DILocation(line: 10, column: 11, scope: !12)
-;; A discriminator of 134217775 which is 0x6f80057 in hexdecimal, stands for an indirect call probe
-;; with an index of 5 and probe factor of 0.
-!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217775)
+;; A discriminator of 134217759, which is 0x800001F in hexadecimal, stands for an indirect call probe
+;; with an index of 3 and probe factor of 0.
+!14 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 134217759)
!15 = distinct !DILocation(line: 10, column: 11, scope: !14)
!16 = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-
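As a cross-check of the decoding claimed in the comments above, the packed probe fields can be unpicked with plain shifts and masks. A minimal sketch; the field positions are taken from the comment block in llvm/include/llvm/IR/PseudoProbe.h and should be treated as an assumption of this note rather than part of the test:

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed layout of a pseudo-probe DWARF discriminator:
  //   [2:0]   base discriminator, always 0x7 for pseudo probes
  //   [18:3]  probe index
  //   [25:19] distribution factor in percent (100 means a factor of 1.0)
  //   [28:26] probe type
  uint32_t D = 186646559; // 0xB20001F, from the comment above
  printf("base=0x%x index=%u factor=%u%% type=%u\n",
         D & 0x7, (D >> 3) & 0xFFFF, (D >> 19) & 0x7F, (D >> 26) & 0x7);
  // Prints: base=0x7 index=3 factor=100% type=2
  return 0;
}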
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
index 474b666..867a49d 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-profile.ll
@@ -22,12 +22,12 @@ if.then:
if.else:
; CHECK: call {{.*}}, !dbg ![[#PROBE2:]], !prof ![[PROF2:[0-9]+]]
call void %f(i32 2)
- ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+ ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
store i32 2, ptr %retval, align 4
br label %return
return:
- ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
+ ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
%1 = load i32, ptr %retval, align 4
ret i32 %1
}
@@ -36,14 +36,14 @@ attributes #0 = {"use-sample-profile"}
; CHECK: ![[PD1]] = !{!"branch_weights", i32 8, i32 7}
; CHECK: ![[#PROBE1]] = !DILocation(line: 0, scope: ![[#SCOPE1:]])
+;; A discriminator of 119537695, which is 0x720001f in hexadecimal, stands for an indirect call probe
+;; with an index of 3.
+; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537695)
+; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
;; A discriminator of 119537711 which is 0x720002f in hexdecimal, stands for an indirect call probe
;; with an index of 5.
-; CHECK: ![[#SCOPE1]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
-; CHECK: ![[PROF1]] = !{!"VP", i32 0, i64 7, i64 9191153033785521275, i64 5, i64 -1069303473483922844, i64 2}
-;; A discriminator of 119537719 which is 0x7200037 in hexdecimal, stands for an indirect call probe
-;; with an index of 6.
; CHECK: ![[#PROBE2]] = !DILocation(line: 0, scope: ![[#SCOPE2:]])
-; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537719)
+; CHECK: ![[#SCOPE2]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 119537711)
; CHECK: ![[PROF2]] = !{!"VP", i32 0, i64 6, i64 -1069303473483922844, i64 4, i64 9191153033785521275, i64 2}
!llvm.module.flags = !{!9, !10}
@@ -83,7 +83,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '7'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '5'
+;YAML-NEXT: - ProbeId: '3'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -113,7 +113,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '6'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '6'
+;YAML-NEXT: - ProbeId: '5'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -128,7 +128,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '6'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '3'
+;YAML-NEXT: - ProbeId: '4'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
@@ -143,7 +143,7 @@ attributes #0 = {"use-sample-profile"}
;YAML-NEXT: - String: 'Applied '
;YAML-NEXT: - NumSamples: '13'
;YAML-NEXT: - String: ' samples from profile (ProbeId='
-;YAML-NEXT: - ProbeId: '4'
+;YAML-NEXT: - ProbeId: '6'
;YAML-NEXT: - String: ', Factor='
;YAML-NEXT: - Factor: '1.000000e+00'
;YAML-NEXT: - String: ', OriginalSamples='
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
index 992afed..217b619 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update.ll
@@ -14,15 +14,15 @@ T1:
%v1 = call i32 @f1()
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
-;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
+;; The distribution factor -8513881372706734080 stands for 53.85%, which is from 7/(6+7).
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -8513881372706734080)
%cond3 = icmp eq i32 %v1, 412
br label %Merge
F1:
; CHECK: %v2 = call i32 @f2(), !prof ![[#PROF2:]]
%v2 = call i32 @f2()
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 3, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
+;; The distribution factor 8513881922462547968 stands for 46.15%, which is from 6/(6+7).
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 8513881922462547968)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 8513881922462547968)
br label %Merge
Merge:
@@ -30,11 +30,11 @@ Merge:
%B = phi i32 [%v1, %T1], [%v2, %F1]
br i1 %A, label %T2, label %F2
T2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 7, i32 0, i64 -1)
call void @f3()
ret i32 %B
F2:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 9, i32 0, i64 -1)
ret i32 %B
}
@@ -42,4 +42,3 @@ F2:
; CHECK: ![[#PROF2]] = !{!"branch_weights", i32 6}
attributes #0 = {"use-sample-profile"}
-
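The factor percentages in the comments above can be re-derived from the raw i64 probe operands. A minimal sketch, assuming the PseudoProbe.h convention that UINT64_MAX encodes a full (1.0) distribution factor and that the operand scales linearly; that convention is an assumption of this note:

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed: percentage = operand / UINT64_MAX * 100, with the negative
  // literal reinterpreted as an unsigned 64-bit value.
  const double Full = 18446744073709551615.0; // UINT64_MAX
  uint64_t F1 = (uint64_t)-8513881372706734080LL; // probe 6 in block T1
  uint64_t F2 = 8513881922462547968ULL;           // probe 6 in block F1
  printf("%.2f%% %.2f%%\n", 100.0 * F1 / Full, 100.0 * F2 / Full);
  // Prints: 53.85% 46.15%, i.e. 7/(6+7) and 6/(6+7) of the samples
  return 0;
}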
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
index f70e518..b622cfb 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-verify.ll
@@ -4,7 +4,7 @@
; VERIFY: *** Pseudo Probe Verification After LoopFullUnrollPass ***
; VERIFY: Function foo:
-; VERIFY-DAG: Probe 6 previous factor 1.00 current factor 5.00
+; VERIFY-DAG: Probe 5 previous factor 1.00 current factor 5.00
; VERIFY-DAG: Probe 4 previous factor 1.00 current factor 5.00
declare void @foo2() nounwind
@@ -27,15 +27,15 @@ bb7.preheader:
bb10:
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -1)
-; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
+; CHECK: call void @foo2(), !dbg ![[#PROBE6:]]
; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 2, i32 0, i64 -1)
%indvars.iv = phi i64 [ 0, %bb7.preheader ], [ %indvars.iv.next, %bb10 ]
%tmp1.14 = phi i32 [ %tmp1.06, %bb7.preheader ], [ %spec.select, %bb10 ]
@@ -50,14 +50,14 @@ bb10:
br i1 %exitcond.not, label %bb3.loopexit, label %bb10, !llvm.loop !13
bb24:
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 5, i32 0, i64 -1)
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 6, i32 0, i64 -1)
ret void
}
-;; A discriminator of 186646583 which is 0xb200037 in hexdecimal, stands for a direct call probe
-;; with an index of 6 and a scale of -1%.
+;; A discriminator of 186646575, which is 0xb20002f in hexadecimal, stands for a direct call probe
+;; with an index of 5 and a scale of -1%.
; CHECK: ![[#PROBE6]] = !DILocation(line: 2, column: 20, scope: ![[#SCOPE:]])
-; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646583)
+; CHECK: ![[#SCOPE]] = !DILexicalBlockFile(scope: ![[#]], file: ![[#]], discriminator: 186646575)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
new file mode 100644
index 0000000..2031c2d
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+
+; standard vector concatenations
+
+define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = zext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = zext nneg <8 x i16> %a0 to <8 x i32>
+ %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <16 x i32> [[R]]
+;
+; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; AVX-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext nneg <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = sext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
+
+define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
+; CHECK-LABEL: @concat_sext_v4i1_v8i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %x0 = sext <4 x i1> %a0 to <4 x i32>
+ %x1 = sext <4 x i1> %a1 to <4 x i32>
+ %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %x0 = trunc <4 x i32> %a0 to <4 x i16>
+ %x1 = trunc <4 x i32> %a1 to <4 x i16>
+ %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %r
+}
+
+define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
+; CHECK-NEXT: ret <8 x ptr> [[R]]
+;
+ %x0 = inttoptr <4 x i32> %a0 to <4 x ptr>
+ %x1 = inttoptr <4 x i32> %a1 to <4 x ptr>
+ %r = shufflevector <4 x ptr> %x0, <4 x ptr> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x ptr> %r
+}
+
+define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
+; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64>
+; CHECK-NEXT: ret <16 x i64> [[R]]
+;
+ %x0 = ptrtoint <8 x ptr> %a0 to <8 x i64>
+ %x1 = ptrtoint <8 x ptr> %a1 to <8 x i64>
+ %r = shufflevector <8 x i64> %x0, <8 x i64> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i64> %r
+}
+
+define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: @concat_fpext_v4f32_v8f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; SSE-NEXT: ret <8 x double> [[R]]
+;
+; AVX-LABEL: @concat_fpext_v4f32_v8f64(
+; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
+; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
+; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT: ret <8 x double> [[R]]
+;
+ %x0 = fpext <4 x float> %a0 to <4 x double>
+ %x1 = fpext <4 x float> %a1 to <4 x double>
+ %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %r
+}
+
+define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) {
+; CHECK-LABEL: @concat_fptrunc_v8f64_v16f32(
+; CHECK-NEXT: [[X0:%.*]] = fptrunc <8 x double> [[A0:%.*]] to <8 x float>
+; CHECK-NEXT: [[X1:%.*]] = fptrunc <8 x double> [[A1:%.*]] to <8 x float>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <16 x float> [[R]]
+;
+ %x0 = fptrunc <8 x double> %a0 to <8 x float>
+ %x1 = fptrunc <8 x double> %a1 to <8 x float>
+ %r = shufflevector <8 x float> %x0, <8 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %r
+}
+
+; commuted vector concatenation
+
+define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @rconcat_sext_v8i16_v16i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = sext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i32> %r
+}
+
+; interleaved shuffle
+
+define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: @interleave_fpext_v4f32_v8f64(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
+; CHECK-NEXT: ret <8 x double> [[R]]
+;
+ %x0 = fpext <4 x float> %a0 to <4 x double>
+ %x1 = fpext <4 x float> %a1 to <4 x double>
+ %r = shufflevector <4 x double> %x0, <4 x double> %x1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x double> %r
+}
+
+; negative - multiuse
+
+define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) {
+; CHECK-LABEL: @concat_trunc_v4i32_v8i16_multiuse(
+; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16>
+; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2:%.*]], align 8
+; CHECK-NEXT: ret <8 x i16> [[R]]
+;
+ %x0 = trunc <4 x i32> %a0 to <4 x i16>
+ %x1 = trunc <4 x i32> %a1 to <4 x i16>
+ %r = shufflevector <4 x i16> %x0, <4 x i16> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <4 x i16> %x0, ptr %a2
+ ret <8 x i16> %r
+}
+
+; negative - bitcasts
+
+define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: @concat_bitcast_v4i32_v8f32(
+; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0:%.*]] to <4 x float>
+; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1:%.*]] to <4 x float>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x float> [[R]]
+;
+ %x0 = bitcast <4 x i32> %a0 to <4 x float>
+ %x1 = bitcast <4 x i32> %a1 to <4 x float>
+ %r = shufflevector <4 x float> %x0, <4 x float> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %r
+}
+
+; negative - src type mismatch
+
+define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_v4i8_v4i16_v8i32(
+; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0:%.*]] to <4 x i32>
+; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1:%.*]] to <4 x i32>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i32> [[R]]
+;
+ %x0 = sext <4 x i8> %a0 to <4 x i32>
+ %x1 = sext <4 x i16> %a1 to <4 x i32>
+ %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %r
+}
+
+; negative - castop mismatch
+
+define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: @concat_sext_zext_v8i16_v16i32(
+; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <16 x i32> [[R]]
+;
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = zext <8 x i16> %a1 to <8 x i32>
+ %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %r
+}
diff --git a/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
new file mode 100644
index 0000000..435073d
--- /dev/null
+++ b/llvm/test/Verifier/module-flags-note-gnu-property-elf-pauthabi.ll
@@ -0,0 +1,19 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; CHECK: either both or no 'aarch64-elf-pauthabi-platform' and 'aarch64-elf-pauthabi-version' module flags must be present
+
+;--- err1.ll
+
+; RUN: not llvm-as err1.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-platform", i32 2}
+
+;--- err2.ll
+
+; RUN: not llvm-as err2.ll -o /dev/null 2>&1 | FileCheck %s
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"aarch64-elf-pauthabi-version", i32 31}
diff --git a/llvm/test/Verifier/pr69428.ll b/llvm/test/Verifier/pr69428.ll
new file mode 100644
index 0000000..be8733b
--- /dev/null
+++ b/llvm/test/Verifier/pr69428.ll
@@ -0,0 +1,48 @@
+; RUN: llvm-as -disable-output %s
+
+%struct._List_node_emplace_op2 = type { i8 }
+
+@"?_List@@3HA" = global i32 0, align 4
+
+define void @"?ExecutionEngineaddExecutableDependency@@YAXXZ"() personality ptr @__CxxFrameHandler3 {
+entry:
+ %agg.tmp.ensured.i = alloca %struct._List_node_emplace_op2, align 1
+ %0 = load i32, ptr @"?_List@@3HA", align 4
+ %call.i = call noundef ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr %agg.tmp.ensured.i, i32 %0)
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i unwind label %ehcleanup.i
+
+invoke.cont.i: ; preds = %entry
+ invoke void @llvm.seh.scope.end()
+ to label %invoke.cont2.i unwind label %ehcleanup.i
+
+invoke.cont2.i: ; preds = %invoke.cont.i
+ call void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6
+ unreachable
+
+ehcleanup.i: ; preds = %invoke.cont.i, %entry
+ %1 = cleanuppad within none []
+ invoke void @llvm.seh.scope.begin()
+ to label %invoke.cont.i.i unwind label %ehcleanup.i.i
+
+invoke.cont.i.i: ; preds = %ehcleanup.i
+ invoke void @llvm.seh.scope.end()
+ to label %"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i" unwind label %ehcleanup.i.i
+
+ehcleanup.i.i: ; preds = %invoke.cont.i.i, %ehcleanup.i
+ %2 = cleanuppad within %1 []
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %2) ]
+ cleanupret from %2 unwind to caller
+
+"??1?$_List_node_emplace_op2@H@@QEAA@XZ.exit.i": ; preds = %invoke.cont.i.i
+ call void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr %agg.tmp.ensured.i) #6 [ "funclet"(token %1) ]
+ cleanupret from %1 unwind to caller
+}
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @llvm.seh.scope.begin()
+declare void @llvm.seh.scope.end()
+
+declare void @"??1?$_List_node_emplace_op2@H@@QEAA@XZ"(ptr)
+declare void @"??1_Alloc_construct_ptr@@QEAA@XZ"(ptr)
+declare ptr @"??0?$_List_node_emplace_op2@H@@QEAA@H@Z"(ptr, i32)
diff --git a/llvm/test/tools/dsymutil/ARM/firmware.test b/llvm/test/tools/dsymutil/ARM/firmware.test
new file mode 100644
index 0000000..128faa5
--- /dev/null
+++ b/llvm/test/tools/dsymutil/ARM/firmware.test
@@ -0,0 +1,11 @@
+$ cat test.c
+int main() {
+ return 0;
+}
+
+$ xcrun clang -O0 -target arm64-apple-unknown-macho test.c -c -o test.o
+$ xcrun ld -arch arm64 -o test.out test.o -platform_version firmware 0 0
+
+RUN: dsymutil -oso-prepend-path %p/../Inputs %p/../Inputs/private/tmp/firmware/test.out -o %t.dSYM
+RUN: llvm-objdump -h %t.dSYM/Contents/Resources/DWARF/test.out | FileCheck %s
+CHECK: file format mach-o arm64
diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o
new file mode 100644
index 0000000..3bc83ca
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.o
Binary files differ
diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out
new file mode 100755
index 0000000..21fe4d2
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/private/tmp/firmware/test.out
Binary files differ
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
index 98b8619..1b196b4 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 14 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 19 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 20 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 2 1.00 vextractps $1, %xmm0, %ecx
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 17 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 11 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 257.00 215.25 235.25 176.17 176.17 38.00 424.25 2.25 12.67
+# CHECK-NEXT: - 257.00 216.25 247.25 173.17 173.17 38.00 424.25 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
index a2899b4..4865121 100644
--- a/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Broadwell/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 14 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 14 2.00 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 2.00 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 19 2.00 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 15 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 7 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 11 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.33 22.33 25.67 25.67 5.00 80.33 - 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
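The Broadwell hunks above (and the matching Haswell and Skylake ones below) are generated FileCheck expectations tracking a scheduler-model update: the memory forms of dpps gain a micro-op (5 -> 6), and the register forms of the round* family go from one micro-op at 0.50 reciprocal throughput to two micro-ops at 2.00, which is what shifts the per-iteration resource-pressure totals. These expectations are not edited by hand; a sketch for regenerating one file, assuming a built llvm-mca and the in-tree update script:

$ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -instruction-tables resources-sse41.s
$ llvm/utils/update_mca_test_checks.py llvm/test/tools/llvm-mca/X86/Broadwell/resources-*.s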
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
index 376070d..05c4760 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 20 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 20 2.00 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 14 2.00 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 21 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 21 2.00 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 2 1.00 vextractps $1, %xmm0, %ecx
@@ -1632,17 +1632,17 @@ vzeroupper
# CHECK-NEXT: 4 18 2.00 * vrcpps (%rax), %ymm2
# CHECK-NEXT: 1 5 1.00 vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 2 10 1.00 * vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: 2 6 2.00 vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: 3 13 2.00 * vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: 1 6 0.50 vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 6 0.50 vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2 6 2.00 vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 12 2.00 * vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 5 1.00 vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: 2 11 1.00 * vrsqrtps (%rax), %xmm2
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 336.00 214.58 236.58 176.17 176.17 38.00 427.58 2.25 12.67
+# CHECK-NEXT: - 336.00 215.58 248.58 173.17 173.17 38.00 427.58 3.25 12.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
@@ -2342,17 +2342,17 @@ vzeroupper
# CHECK-NEXT: - - 2.33 0.33 0.50 0.50 - 0.33 - - vrcpps (%rax), %ymm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrcpss %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrcpss (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundpd $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundpd $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundpd $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundps $1, %ymm0, %ymm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundps $1, %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundps $1, (%rax), %ymm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundsd $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundsd $1, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - vroundss $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - vroundss $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - vroundss $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 - - - - - - - vrsqrtps %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - vrsqrtps (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
index 70d9398..62dfa23 100644
--- a/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Haswell/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 14 2.00 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 20 2.00 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 20 2.00 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -243,13 +243,13 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 16 2.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 2 2 1.00 ptest %xmm0, %xmm1
# CHECK-NEXT: 3 8 1.00 * ptest (%rax), %xmm1
-# CHECK-NEXT: 1 6 0.50 roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundps $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundps $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: 1 6 0.50 roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: 2 6 2.00 roundss $1, %xmm0, %xmm2
# CHECK-NEXT: 3 12 2.00 * roundss $1, (%rax), %xmm2
# CHECK: Resources:
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 23.33 22.33 25.67 25.67 5.00 80.33 - 1.67
+# CHECK-NEXT: - - 23.83 30.33 23.67 23.67 5.00 80.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 1.00 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 2.00 1.00 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.50 1.00 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
@@ -358,11 +358,11 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 2.00 - 0.50 0.50 - - - - pmulld (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - ptest %xmm0, %xmm1
# CHECK-NEXT: - - 1.00 - 0.50 0.50 - 1.00 - - ptest (%rax), %xmm1
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundpd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundpd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundpd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundps $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundps $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundps $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundsd $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundsd $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundsd $1, (%rax), %xmm2
-# CHECK-NEXT: - - - - 0.50 0.50 - - - - roundss $1, %xmm0, %xmm2
+# CHECK-NEXT: - - - 2.00 - - - - - - roundss $1, %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 0.50 0.50 - - - - roundss $1, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
index c2e0217..ef5a9e3 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-avx1.s
@@ -1189,9 +1189,9 @@ vzeroupper
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 13 1.50 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 6 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 13 1.50 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 6 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 3 1.00 vextractps $1, %xmm0, %ecx
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 338.58 199.58 173.83 173.83 38.00 326.58 5.25 11.33
+# CHECK-NEXT: - 126.00 339.58 199.58 173.83 173.83 38.00 326.58 6.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1899,9 +1899,9 @@ vzeroupper
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - vdppd $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
index 6e11bb6..1d8d67f 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/resources-sse41.s
@@ -166,7 +166,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
# CHECK-NEXT: 4 13 1.50 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 1.50 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 6 19 1.50 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 37.33 31.33 23.67 23.67 5.00 63.33 - 1.67
+# CHECK-NEXT: - - 37.83 31.33 23.67 23.67 5.00 63.33 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -281,7 +281,7 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 1.00 - - - 1.00 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 1.00 - - dppd $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 1.50 1.50 0.50 0.50 - 1.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
index de14ef7..cabb002 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s
@@ -1188,10 +1188,10 @@ vzeroupper
# CHECK-NEXT: 2 16 3.00 * vdivss (%rax), %xmm1, %xmm2
# CHECK-NEXT: 3 9 1.00 vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 15 1.00 * vdppd $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4 13 1.33 vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 5 19 1.33 * vdpps $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4 13 1.33 vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 5 20 1.33 * vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 4 13 1.50 vdpps $22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 6 19 1.50 * vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4 13 1.50 vdpps $22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 6 20 1.50 * vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: 2 1 1.00 * vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: 2 3 1.00 vextractps $1, %xmm0, %ecx
@@ -1736,7 +1736,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - 126.00 322.25 200.25 173.83 173.83 38.00 330.25 6.25 11.33
+# CHECK-NEXT: - 126.00 325.25 202.25 173.83 173.83 38.00 326.25 7.25 11.33
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -1898,10 +1898,10 @@ vzeroupper
# CHECK-NEXT: - 3.00 1.00 - 0.50 0.50 - - - - vdivss (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - 0.67 0.67 - - - 1.67 - - vdppd $22, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 1.67 - - vdppd $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - vdpps $22, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - vdpps $22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - vdpps $22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - vdpps $22, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - vextractf128 $1, %ymm0, %xmm2
# CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax)
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
index 15cd09b..e3f34fd 100644
--- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse41.s
@@ -165,8 +165,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 3 8 0.67 * blendvps %xmm0, (%rax), %xmm2
# CHECK-NEXT: 3 9 1.00 dppd $22, %xmm0, %xmm2
# CHECK-NEXT: 4 15 1.00 * dppd $22, (%rax), %xmm2
-# CHECK-NEXT: 4 13 1.33 dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: 5 19 1.33 * dpps $22, (%rax), %xmm2
+# CHECK-NEXT: 4 13 1.50 dpps $22, %xmm0, %xmm2
+# CHECK-NEXT: 6 19 1.50 * dpps $22, (%rax), %xmm2
# CHECK-NEXT: 2 3 1.00 extractps $1, %xmm0, %ecx
# CHECK-NEXT: 3 2 1.00 * extractps $1, %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 insertps $1, %xmm0, %xmm2
@@ -266,7 +266,7 @@ roundss $1, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
-# CHECK-NEXT: - - 36.67 28.67 23.67 23.67 5.00 66.67 - 1.67
+# CHECK-NEXT: - - 38.17 29.67 23.67 23.67 5.00 64.67 0.50 1.67
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
@@ -280,8 +280,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 0.67 - - blendvps %xmm0, (%rax), %xmm2
# CHECK-NEXT: - - 0.67 0.67 - - - 1.67 - - dppd $22, %xmm0, %xmm2
# CHECK-NEXT: - - 0.67 0.67 0.50 0.50 - 1.67 - - dppd $22, (%rax), %xmm2
-# CHECK-NEXT: - - 1.00 1.00 - - - 2.00 - - dpps $22, %xmm0, %xmm2
-# CHECK-NEXT: - - 1.00 1.00 0.50 0.50 - 2.00 - - dpps $22, (%rax), %xmm2
+# CHECK-NEXT: - - 1.50 1.50 - - - 1.00 - - dpps $22, %xmm0, %xmm2
+# CHECK-NEXT: - - 2.00 1.50 0.50 0.50 - 1.00 0.50 - dpps $22, (%rax), %xmm2
# CHECK-NEXT: - - 1.00 - - - - 1.00 - - extractps $1, %xmm0, %ecx
# CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 extractps $1, %xmm0, (%rax)
# CHECK-NEXT: - - - - - - - 1.00 - - insertps $1, %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
index 4654ce1..349abec 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/dependency-breaking-gpr.s
@@ -68,12 +68,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -145,12 +145,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -223,12 +223,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -306,12 +306,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -472,12 +472,12 @@ cmovael %eax, %ecx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
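This and the remaining Znver3 diffs are one mechanical change repeated: the floating-point pipe resources were renamed from Zn3FPP0-Zn3FPP45 to Zn3FP0-Zn3FP45 (presumably in the Znver3 machine model, X86ScheduleZnver3.td). llvm-mca prints resource labels straight out of the scheduler model, so only the legend lines change while every pressure number stays put. A sketch for confirming the new labels against any of these inputs:

$ llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver3 dependency-breaking-gpr.s | grep 'Zn3FP'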
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
index 12d6f39..0fcd6f5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s
@@ -46,12 +46,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -122,12 +122,12 @@ add %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
index 93f8d76..cd427bb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
@@ -41,12 +41,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -112,12 +112,12 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
index 13ef5bc..bf82486 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
@@ -43,12 +43,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -115,12 +115,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
index bfe8be8..8a5a014 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-same-regs.s
@@ -44,12 +44,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -116,12 +116,12 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
index 1431875..f0e16a8 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-xmm.s
@@ -68,12 +68,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
index eb2bb97..97f6a34 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-avx-ymm.s
@@ -68,12 +68,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ vpaddq %ymm0, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
index 5909af8..c733f63 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-mmx.s
@@ -63,12 +63,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -154,12 +154,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -245,12 +245,12 @@ paddd %mm0, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
index 5a05487..63df99e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/one-idioms-sse-xmm.s
@@ -68,12 +68,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -159,12 +159,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -250,12 +250,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -341,12 +341,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
index 7ac674c..66c1322 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-3.s
@@ -40,12 +40,12 @@ xor %bx, %dx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
index 582da14..4ed529e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-4.s
@@ -40,12 +40,12 @@ add %cx, %bx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
index dda87e9..5894111 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-5.s
@@ -33,12 +33,12 @@ lzcnt %ax, %bx ## partial register stall.
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
index 71520ea..fdbf4d9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/partial-reg-update-6.s
@@ -42,12 +42,12 @@ lzcnt 2(%rsp), %cx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
index 7afa80c..f3e515c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-xmm.s
@@ -180,12 +180,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
index 8b81d55..a484a75 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-avx-ymm.s
@@ -180,12 +180,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ vmovdqu %ymm15, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
index f359048..eb20d13 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-gpr.s
@@ -134,12 +134,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -402,12 +402,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -670,12 +670,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -938,12 +938,12 @@ xchgq %r15, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
index b556fd6..e17d671 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-mmx.s
@@ -61,12 +61,12 @@ movq %mm7, %mm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
index 147cb0f..b45fd17 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-sse-xmm.s
@@ -180,12 +180,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -474,12 +474,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -768,12 +768,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1062,12 +1062,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1356,12 +1356,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1650,12 +1650,12 @@ movdqu %xmm15, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
index de59edf..0465d41 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/reg-move-elimination-x87.s
@@ -67,12 +67,12 @@ fxch %st(0)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
index 4e024e5..9c5a19b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-adx.s
@@ -38,12 +38,12 @@ adox (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
index 5abf3cc..d108696 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-aes.s
@@ -50,12 +50,12 @@ aeskeygenassist $22, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
index 146b3ce..4f0b484 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx1.s
@@ -1731,12 +1731,12 @@ vzeroupper
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
index 3c6b31a..1a8b9e2 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-avx2.s
@@ -771,12 +771,12 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
index 8c0e841..2600237 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi1.s
@@ -85,12 +85,12 @@ tzcnt (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
index 8d00c99..0664c1d 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-bmi2.s
@@ -100,12 +100,12 @@ shrx %rax, (%rbx), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
index 3e7219c..b40d155 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clflushopt.s
@@ -23,12 +23,12 @@ clflushopt (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
index 0dc89fa..0f9935c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-clzero.s
@@ -23,12 +23,12 @@ clzero
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
index e0e46af..8118e40 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmov.s
@@ -218,12 +218,12 @@ cmovgq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
index 03763e5..9ab8776 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-cmpxchg.s
@@ -25,12 +25,12 @@ cmpxchg16b (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
index bb995d5..345ae02 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-f16c.s
@@ -40,12 +40,12 @@ vcvtps2ph $0, %ymm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
index 9af180d..af207f0 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fma.s
@@ -500,12 +500,12 @@ vfnmsub231ss (%rax), %xmm1, %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
index 142508c..3e65183 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-fsgsbase.s
@@ -40,12 +40,12 @@ wrgsbase %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
index 1545a22..0257202 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lea.s
@@ -293,12 +293,12 @@ lea 1024(%rax, %rbx, 2), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
index ffbe414..735287a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-lzcnt.s
@@ -35,12 +35,12 @@ lzcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
index 75dbf95..2bc6177 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mmx.s
@@ -279,12 +279,12 @@ pxor (%rax), %mm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
index 144e97f..6eeabbd 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-movbe.s
@@ -35,12 +35,12 @@ movbe (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
index 3b343d7..103fd3eb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-mwaitx.s
@@ -25,12 +25,12 @@ mwaitx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
index 2d9f0e9..893f476 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-pclmul.s
@@ -25,12 +25,12 @@ pclmulqdq $11, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
index cce078f..29bcc5c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-popcnt.s
@@ -35,12 +35,12 @@ popcntq (%rax), %rcx
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
index 5423b6b..b80e8f7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-prefetchw.s
@@ -25,12 +25,12 @@ prefetchw (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
index fb09253..649eb10 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdrand.s
@@ -27,12 +27,12 @@ rdrand %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
index f10a90f..44e0eeb 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-rdseed.s
@@ -27,12 +27,12 @@ rdseed %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
index 360a667..e6d5ab9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sha.s
@@ -55,12 +55,12 @@ sha256rnds2 (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
index 9816b87..4c7a3f0 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse1.s
@@ -328,12 +328,12 @@ xorps (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
index f69c535..d24aebf 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse2.s
@@ -684,12 +684,12 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
index 8110390..51bb95f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse3.s
@@ -74,12 +74,12 @@ mwait
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
index 0cc6c6a..e952a16 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse41.s
@@ -261,12 +261,12 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
index 873e4f4..8afcd80 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse42.s
@@ -70,12 +70,12 @@ pcmpgtq (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
index 1c1b0b2..6606a3e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-sse4a.s
@@ -35,12 +35,12 @@ movntss %xmm0, (%rax)
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
index aeec493..6668870 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-ssse3.s
@@ -180,12 +180,12 @@ psignw (%rax), %xmm2
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
index 076094f..81afc7d 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vaes.s
@@ -40,12 +40,12 @@ vaesenclast (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
index 31680d5..10440e9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-vpclmulqdq.s
@@ -25,12 +25,12 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
index fb09b65..8f627ca 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_32.s
@@ -56,12 +56,12 @@ salc
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
index fedb3d2..41ec631 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x86_64.s
@@ -1957,12 +1957,12 @@ xorq (%rax), %rdi
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
index 9a92bd0..cd8a06a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-x87.s
@@ -364,12 +364,12 @@ fyl2xp1
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
index 819361c..f348ff8 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/resources-xsave.s
@@ -35,12 +35,12 @@ xsetbv
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
index 33657e6..ed4e8f9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-xmm.s
@@ -138,12 +138,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ vpaddq %xmm0, %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
index ba7f51e..2404336 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-avx-ymm.s
@@ -148,12 +148,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -239,12 +239,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -330,12 +330,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -421,12 +421,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -512,12 +512,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -603,12 +603,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -694,12 +694,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -785,12 +785,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -876,12 +876,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -967,12 +967,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1058,12 +1058,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1149,12 +1149,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1240,12 +1240,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1331,12 +1331,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1422,12 +1422,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1513,12 +1513,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1604,12 +1604,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1695,12 +1695,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1786,12 +1786,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1878,12 +1878,12 @@ vpxor %ymm1, %ymm0, %ymm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
index 018adc2..4d648f7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-gpr.s
@@ -68,12 +68,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -175,12 +175,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -282,12 +282,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -389,12 +389,12 @@ addq %rax, %rax
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
index 935881a..aca39c5 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/zero-idioms-sse-xmm.s
@@ -138,12 +138,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -229,12 +229,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -320,12 +320,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -411,12 +411,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -502,12 +502,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -593,12 +593,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -684,12 +684,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -775,12 +775,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -866,12 +866,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -957,12 +957,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1048,12 +1048,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1139,12 +1139,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1230,12 +1230,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1321,12 +1321,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1412,12 +1412,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1503,12 +1503,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1594,12 +1594,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
@@ -1685,12 +1685,12 @@ paddq %xmm0, %xmm0
# CHECK-NEXT: [5] - Zn3ALU2
# CHECK-NEXT: [6] - Zn3ALU3
# CHECK-NEXT: [7] - Zn3BRU1
-# CHECK-NEXT: [8] - Zn3FPP0
-# CHECK-NEXT: [9] - Zn3FPP1
-# CHECK-NEXT: [10] - Zn3FPP2
-# CHECK-NEXT: [11] - Zn3FPP3
-# CHECK-NEXT: [12.0] - Zn3FPP45
-# CHECK-NEXT: [12.1] - Zn3FPP45
+# CHECK-NEXT: [8] - Zn3FP0
+# CHECK-NEXT: [9] - Zn3FP1
+# CHECK-NEXT: [10] - Zn3FP2
+# CHECK-NEXT: [11] - Zn3FP3
+# CHECK-NEXT: [12.0] - Zn3FP45
+# CHECK-NEXT: [12.1] - Zn3FP45
# CHECK-NEXT: [13] - Zn3FPSt
# CHECK-NEXT: [14.0] - Zn3LSU
# CHECK-NEXT: [14.1] - Zn3LSU
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
new file mode 100644
index 0000000..064ffca
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections-within-segment.s
@@ -0,0 +1,38 @@
+## Disallow (de)compression for sections within a segment, as they are
+## effectively immutable: their file offsets and sizes are pinned by the
+## program headers.
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: yaml2obj %s -o a
+# RUN: not llvm-objcopy a /dev/null --compress-sections .text=zlib 2>&1 | FileCheck %s --implicit-check-not=error:
+
+# CHECK: error: 'a': section '.text' within a segment cannot be (de)compressed
+
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=none 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=error:
+
+# CHECK2: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+## There is an error even if 'foo' is already compressed with zlib.
+# RUN: not llvm-objcopy a /dev/null --compress-sections foo=zlib 2>&1 | FileCheck %s --check-prefix=CHECK3 --implicit-check-not=error:
+
+# CHECK3: error: 'a': section 'foo' within a segment cannot be (de)compressed
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ FirstSec: .text
+ LastSec: foo
+ Align: 0x1000
+ Offset: 0x1000
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Offset: 0x1000
+ Content: C3
+ - Name: foo
+ Type: SHT_PROGBITS
+ Flags: [ SHF_COMPRESSED ]
+ Content: 010000000000000040000000000000000100000000000000789cd36280002d3269002f800151
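+# foo's Content above is an Elf64_Chdr (ch_type=1 ELFCOMPRESS_ZLIB,
+# ch_size=0x40, ch_addralign=1) followed by a zlib-compressed payload
+# (the 789c... bytes).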
diff --git a/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
new file mode 100644
index 0000000..e6fa860
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/compress-sections.s
@@ -0,0 +1,128 @@
+# REQUIRES: x86-registered-target, zlib, zstd
+
+# RUN: rm -rf %t && mkdir %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o
+## '*0=none' wins because it comes last. '*0' sections are decompressed (if
+## originally compressed) or kept unchanged (if uncompressed).
+## No section is named 'nomatch', so the third option is a no-op.
+# RUN: llvm-objcopy a.o out --compress-sections='*0=zlib' --compress-sections '*0=none' --compress-sections 'nomatch=none' 2>&1 | count 0
+# RUN: llvm-readelf -S out | FileCheck %s --check-prefix=CHECK1
+
+# CHECK1: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK1: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK1: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK1-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK1-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 7 8
+# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## Mixing zlib and zstd.
+# RUN: llvm-objcopy a.o out2 --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -Sr -x nonalloc0 -x .debug_str out2 2>&1 | FileCheck %s --check-prefix=CHECK2
+# RUN: llvm-readelf -z -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2DE
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK2: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 3 8
+# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK2-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK2-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 8
+
+## llvm-readelf -r doesn't support SHF_COMPRESSED SHT_RELA.
+# CHECK2: warning: {{.*}}: unable to read relocations from SHT_RELA section with index 8: section [index 8] has an invalid sh_size ([[#]]) which is not a multiple of its sh_entsize (24)
+
+# CHECK2: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK2-NEXT: 01000000 00000000 10000000 00000000
+# CHECK2-NEXT: 08000000 00000000 {{.*}}
+# CHECK2: Hex dump of section '.debug_str':
+## zstd with ch_size=0x38
+# CHECK2-NEXT: 02000000 00000000 38000000 00000000
+# CHECK2-NEXT: 01000000 00000000 {{.*}}
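+## The leading bytes of each dump are the Elf64_Chdr compression header. As a
+## rough reference (generic ELF layout, not specific to this test):
+##   Elf64_Word  ch_type;      // 1 = ELFCOMPRESS_ZLIB, 2 = ELFCOMPRESS_ZSTD
+##   Elf64_Word  ch_reserved;
+##   Elf64_Xword ch_size;      // uncompressed size (0x10 / 0x38 above)
+##   Elf64_Xword ch_addralign; // original sh_addralign (8 / 1 above)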
+
+# CHECK2DE: Hex dump of section 'nonalloc0':
+# CHECK2DE-NEXT: 0x00000000 00000000 00000000 00000000 00000000 ................
+# CHECK2DE-EMPTY:
+# CHECK2DE-NEXT: Hex dump of section '.debug_str':
+# CHECK2DE-NEXT: 0x00000000 41414141 41414141 41414141 41414141 AAAAAAAAAAAAAAAA
+
+## --decompress-debug-sections takes precedence, even when it is specified
+## before --compress-sections.
+# RUN: llvm-objcopy a.o out3 --decompress-debug-sections --compress-sections .debug_str=zstd
+# RUN: llvm-readelf -S out3 | FileCheck %s --check-prefix=CHECK3
+
+# CHECK3: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+# RUN: llvm-objcopy a.o out4 --compress-sections '*0=zlib'
+# RUN: llvm-readelf -S out4 | FileCheck %s --check-prefix=CHECK4
+
+# CHECK4: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK4: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
+# CHECK4: foo0 PROGBITS [[#%x,FOO0:]] [[#%x,]] [[#%x,]] 00 AC 0 0 8
+# CHECK4-NEXT: .relafoo0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 3 8
+# CHECK4-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
+# CHECK4-NEXT: .relafoo1 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 I 11 5 8
+# CHECK4: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 8
+# CHECK4-NEXT: .relanonalloc0 RELA [[#%x,]] [[#%x,]] [[#%x,]] 18 IC 11 7 8
+# CHECK4-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK4-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
+
+## If a section is already compressed, a compression request for another
+## format is ignored.
+# RUN: llvm-objcopy a.o out5 --compress-sections 'nonalloc0=zlib'
+# RUN: llvm-readelf -x nonalloc0 out5 | FileCheck %s --check-prefix=CHECK5
+# RUN: llvm-objcopy out5 out5a --compress-sections 'nonalloc0=zstd'
+# RUN: cmp out5 out5a
+
+# CHECK5: Hex dump of section 'nonalloc0':
+## zlib with ch_size=0x10
+# CHECK5-NEXT: 01000000 00000000 10000000 00000000
+# CHECK5-NEXT: 08000000 00000000 {{.*}}
+
+# RUN: not llvm-objcopy --compress-sections=foo a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error:
+# ERR1: error: --compress-sections: parse error, not 'section-glob=[none|zlib|zstd]'
+
+# RUN: llvm-objcopy --compress-sections 'a[=zlib' a.o out 2>&1 | \
+# RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error:
+# ERR2: warning: invalid glob pattern, unmatched '['
+
+# RUN: not llvm-objcopy a.o out --compress-sections='.debug*=zlib-gabi' --compress-sections='.debug*=' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR3 %s
+# ERR3: error: invalid or unsupported --compress-sections format: .debug*=zlib-gabi
+
+# RUN: not llvm-objcopy a.o out --compress-sections='!.debug*=zlib' 2>&1 | \
+# RUN: FileCheck -check-prefix=ERR4 %s
+# ERR4: error: --compress-sections: negative pattern is unsupported
+
+.globl _start
+_start:
+ ret
+
+.section foo0,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section foo1,"a"
+.balign 8
+.quad .text-.
+.quad .text-.
+.section nonalloc0,""
+.balign 8
+.quad .text+1
+.quad .text+2
+sym0:
+.section nonalloc1,""
+.balign 8
+.quad 42
+sym1:
+
+.section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA"
+.Linfo_string1:
+ .asciz "BBBBBBBBBBBBBBBBBBBBBBBBBBB"
diff --git a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
index 4258ddb..d9f4f38 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/decompress-sections.test
@@ -4,6 +4,8 @@
# RUN: yaml2obj %s -o %t
# RUN: llvm-objcopy --decompress-debug-sections %t %t.de
# RUN: llvm-readelf -S %t.de | FileCheck %s
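+## Decompressing via --compress-sections '<glob>=none' should be byte-identical
+## to the --decompress-debug-sections output.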
+# RUN: llvm-objcopy --compress-sections '*nonalloc=none' --compress-sections .debugx=none %t %t.1.de
+# RUN: cmp %t.de %t.1.de
# CHECK: Name Type Address Off Size ES Flg Lk Inf Al
# CHECK: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 AC 0 0 0
@@ -11,6 +13,33 @@
# CHECK-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
# CHECK-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+# RUN: llvm-objcopy --compress-sections '.debug*=none' %t %t2.de
+# RUN: llvm-readelf -S -x .debug_alloc -x .debug_nonalloc -x .debugx %t2.de | FileCheck %s --check-prefix=CHECK2
+
+# CHECK2: Name Type Address Off Size ES Flg Lk Inf Al
+# CHECK2: .debug_alloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 A 0 0 1
+# CHECK2-NEXT: .debug_nonalloc PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: .debugx PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 1
+# CHECK2-NEXT: nodebug PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 0
+
+# CHECK2: Hex dump of section '.debug_alloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2: Hex dump of section '.debug_nonalloc':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-EMPTY:
+# CHECK2-NEXT: Hex dump of section '.debugx':
+# CHECK2-NEXT: 0x00000000 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000010 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000020 2a000000 00000000 2a000000 00000000 *.......*.......
+# CHECK2-NEXT: 0x00000030 2a000000 00000000 2a000000 00000000 *.......*.......
+
--- !ELF
FileHeader:
Class: ELFCLASS64
diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
index ece36c6..5600bcd 100644
--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-cov5.s
@@ -6,6 +6,13 @@
; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj > %t.o
; RUN: llvm-objdump --disassemble-symbols=kernel.kd %t.o | FileCheck %s --check-prefixes=COV4,CHECK
+;; Make sure we override the default COV in the disassembler on COV6. (There
+;; are currently no differences between COV5 and COV6, so set the default to 4
+;; so that we can verify the default is actually overridden.)
+; RUN: sed 's/CODE_OBJECT_VERSION/6/g' %s \
+; RUN: | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack,+wavefrontsize32,-wavefrontsize64 -filetype=obj > %t.o
+; RUN: llvm-objdump -mllvm --amdhsa-code-object-version=4 --disassemble-symbols=kernel.kd %t.o | FileCheck %s --check-prefixes=COV5,CHECK
+
;; Verify that .amdhsa_uses_dynamic_stack is only printed on COV5+.
; CHECK: .amdhsa_kernel kernel
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
index f28d92e..5125317 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-feature-pauth.s
@@ -1,98 +1,204 @@
# RUN: rm -rf %t && split-file %s %t && cd %t
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag.s -o tag.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o tag-short.o
-# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-long.s -o tag-long.o
-
-# RUN: llvm-readelf --notes tag.o | FileCheck --check-prefix NORMAL %s
-# RUN: llvm-readelf --notes tag-short.o | FileCheck --check-prefix SHORT %s
-# RUN: llvm-readelf --notes tag-long.o | FileCheck --check-prefix LONG %s
-
-# NORMAL: AArch64 PAuth ABI tag: platform 0x2a, version 0x1
-# SHORT: AArch64 PAuth ABI tag: <corrupted size: expected at least 16, got 12>
-# LONG: AArch64 PAuth ABI tag: platform 0x2a, version 0x1, additional info 0xEFCDAB8967452301
-
-# RUN: llvm-readobj --notes tag.o | FileCheck --check-prefix LLVM-NORMAL %s
-# RUN: llvm-readobj --notes tag-short.o | FileCheck --check-prefix LLVM-SHORT %s
-# RUN: llvm-readobj --notes tag-long.o | FileCheck --check-prefix LLVM-LONG %s
-
-# LLVM-SHORT: Notes [
-# LLVM-SHORT-NEXT: NoteSection {
-# LLVM-SHORT-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-SHORT-NEXT: Offset: 0x40
-# LLVM-SHORT-NEXT: Size: 0x1C
-# LLVM-SHORT-NEXT: Note {
-# LLVM-SHORT-NEXT: Owner: ARM
-# LLVM-SHORT-NEXT: Data size: 0xC
-# LLVM-SHORT-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-SHORT-NEXT: Description data (
-# LLVM-SHORT-NEXT: 0000: 2A000000 00000000 01000000
-# LLVM-SHORT-NEXT: )
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: }
-# LLVM-SHORT-NEXT: ]
-
-# LLVM-NORMAL: Notes [
-# LLVM-NORMAL-NEXT: NoteSection {
-# LLVM-NORMAL-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-NORMAL-NEXT: Offset: 0x40
-# LLVM-NORMAL-NEXT: Size: 0x20
-# LLVM-NORMAL-NEXT: Note {
-# LLVM-NORMAL-NEXT: Owner: ARM
-# LLVM-NORMAL-NEXT: Data size: 0x10
-# LLVM-NORMAL-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-NORMAL-NEXT: Platform: 42
-# LLVM-NORMAL-NEXT: Version: 1
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: }
-# LLVM-NORMAL-NEXT: ]
-
-# LLVM-LONG: Notes [
-# LLVM-LONG-NEXT: NoteSection {
-# LLVM-LONG-NEXT: Name: .note.AARCH64-PAUTH-ABI-tag
-# LLVM-LONG-NEXT: Offset: 0x40
-# LLVM-LONG-NEXT: Size: 0x28
-# LLVM-LONG-NEXT: Note {
-# LLVM-LONG-NEXT: Owner: ARM
-# LLVM-LONG-NEXT: Data size: 0x18
-# LLVM-LONG-NEXT: Type: NT_ARM_TYPE_PAUTH_ABI_TAG
-# LLVM-LONG-NEXT: Platform: 42
-# LLVM-LONG-NEXT: Version: 1
-# LLVM-LONG-NEXT: Additional info: EFCDAB8967452301
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: }
-# LLVM-LONG-NEXT: ]
-
-#--- abi-tag.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 16
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-
-#--- abi-tag-short.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 12
-.long 1
-.asciz "ARM"
-
-.quad 42
-.word 1
-
-#--- abi-tag-long.s
-
-.section ".note.AARCH64-PAUTH-ABI-tag", "a"
-.long 4
-.long 24
-.long 1
-.asciz "ARM"
-
-.quad 42 // platform
-.quad 1 // version
-.quad 0x0123456789ABCDEF // extra data
+#--- gnu-42-1.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-42-1.s -o gnu-42-1.o
+# RUN: llvm-readelf --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+# RUN: llvm-readobj --notes gnu-42-1.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x2a (unknown)" -DVERSION=0x1 %s
+
+# ELF: Displaying notes found in: .note.gnu.property
+# ELF-NEXT: Owner Data size Description
+# ELF-NEXT: GNU 0x00000018 NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+
+# OBJ: Notes [
+# OBJ-NEXT: NoteSection {
+# OBJ-NEXT: Name: .note.gnu.property
+# OBJ-NEXT: Offset: 0x40
+# OBJ-NEXT: Size: 0x28
+# OBJ-NEXT: Note {
+# OBJ-NEXT: Owner: GNU
+# OBJ-NEXT: Data size: 0x18
+# OBJ-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-NEXT: Property [
+# OBJ-NEXT: AArch64 PAuth ABI core info: platform [[PLATFORM]], version [[VERSION]]
+# OBJ-NEXT: ]
+# OBJ-NEXT: }
+# OBJ-NEXT: }
+# OBJ-NEXT: ]
+
+#--- gnu-0-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0-0.s -o gnu-0-0.o
+# RUN: llvm-readelf --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-0-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x0 (invalid)" -DVERSION=0x0 %s
+
+#--- gnu-1-0.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 1 // PAuth ABI platform
+ .quad 0 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-1-0.s -o gnu-1-0.o
+# RUN: llvm-readelf --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+# RUN: llvm-readobj --notes gnu-1-0.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x1 (baremetal)" -DVERSION=0x0 %s
+
+#--- gnu-0x10000002-85.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 85 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-85.s -o gnu-0x10000002-85.o
+# RUN: llvm-readelf --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-85.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" \
+# RUN: -DVERSION="0x55 (PointerAuthIntrinsics, !PointerAuthCalls, PointerAuthReturns, !PointerAuthAuthTraps, PointerAuthVTPtrAddressDiscrimination, !PointerAuthVTPtrTypeDiscrimination, PointerAuthInitFini)" %s
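+## 85 = 0x55 = 0b1010101 sets bits 0, 2, 4 and 6; the dumper lists set flags by
+## name and prefixes clear ones with '!', as in the VERSION string above.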
+
+#--- gnu-0x10000002-128.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 16 // Data size
+ .quad 0x10000002 // PAuth ABI platform
+ .quad 128 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-0x10000002-128.s -o gnu-0x10000002-128.o
+# RUN: llvm-readelf --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=ELF -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+# RUN: llvm-readobj --notes gnu-0x10000002-128.o | \
+# RUN: FileCheck --check-prefix=OBJ -DPLATFORM="0x10000002 (llvm_linux)" -DVERSION="0x80 (unknown)" %s
+
+#--- gnu-short.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 12 // Data size
+ .quad 42 // PAuth ABI platform
+ .word 1 // PAuth ABI version
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-short.s -o gnu-short.o
+# RUN: llvm-readelf --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+# RUN: llvm-readobj --notes gnu-short.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=28 -DDATASIZE=18 \
+# RUN: -DERR="<corrupted size: expected 16, got 12>" %s
+
+# ELF-ERR: Displaying notes found in: .note.gnu.property
+# ELF-ERR-NEXT: Owner Data size Description
+# ELF-ERR-NEXT: GNU 0x000000[[DATASIZE]] NT_GNU_PROPERTY_TYPE_0 (property note)
+# ELF-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+
+# OBJ-ERR: Notes [
+# OBJ-ERR-NEXT: NoteSection {
+# OBJ-ERR-NEXT: Name: .note.gnu.property
+# OBJ-ERR-NEXT: Offset: 0x40
+# OBJ-ERR-NEXT: Size: 0x[[SIZE]]
+# OBJ-ERR-NEXT: Note {
+# OBJ-ERR-NEXT: Owner: GNU
+# OBJ-ERR-NEXT: Data size: 0x[[DATASIZE]]
+# OBJ-ERR-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+# OBJ-ERR-NEXT: Property [
+# OBJ-ERR-NEXT: AArch64 PAuth ABI core info: [[ERR]]
+# OBJ-ERR-NEXT: ]
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: }
+# OBJ-ERR-NEXT: ]
+
+#--- gnu-long.s
+.section ".note.gnu.property", "a"
+ .long 4 // Name length is always 4 ("GNU")
+ .long end - begin // Data length
+ .long 5 // Type: NT_GNU_PROPERTY_TYPE_0
+ .asciz "GNU" // Name
+ .p2align 3
+begin:
+ # PAuth ABI property note
+ .long 0xc0000001 // Type: GNU_PROPERTY_AARCH64_FEATURE_PAUTH
+ .long 24 // Data size
+ .quad 42 // PAuth ABI platform
+ .quad 1 // PAuth ABI version
+ .quad 0x0123456789ABCDEF // extra data
+ .p2align 3 // Align to 8 bytes for 64-bit ELF
+end:
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu gnu-long.s -o gnu-long.o
+# RUN: llvm-readelf --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=ELF-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
+# RUN: llvm-readobj --notes gnu-long.o | \
+# RUN: FileCheck --check-prefix=OBJ-ERR -DSIZE=30 -DDATASIZE=20 \
+# RUN: -DERR="<corrupted size: expected 16, got 24>" %s
diff --git a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
index 377e6f9..b517f0b 100644
--- a/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
+++ b/llvm/test/tools/llvm-readobj/ELF/AArch64/aarch64-note-gnu-property.s
@@ -1,3 +1,5 @@
+// See tests for GNU_PROPERTY_AARCH64_FEATURE_PAUTH in aarch64-feature-pauth.s
+
// RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu %s -o %t
// RUN: llvm-readelf --notes %t | FileCheck %s --check-prefix=GNU
// RUN: llvm-readobj --notes %t | FileCheck %s --check-prefix=LLVM